File size: 2,527 Bytes
91e5c50
 
 
 
 
3eec48a
 
 
c10800e
 
 
 
 
 
 
 
3eec48a
c10800e
 
 
91e5c50
c10800e
91e5c50
 
 
 
 
 
 
c10800e
91e5c50
 
 
 
c10800e
91e5c50
 
c10800e
 
 
91e5c50
c10800e
 
 
 
91e5c50
c10800e
 
 
 
 
 
 
 
 
 
 
 
91e5c50
 
 
c10800e
91e5c50
 
 
c10800e
91e5c50
c10800e
 
 
 
 
3e8163e
c10800e
 
 
3e8163e
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
# app.py
import gradio as gr
import tempfile
import pandas as pd
import os
import sys
import subprocess

# Install runtime dependencies with the pip that belongs to the interpreter
# running this script. A bare "pip" resolved through PATH can belong to a
# different Python environment, in which case the packages would be installed
# where this process cannot import them.
# NOTE(review): gradio is already imported above, so installing it here only
# matters for a later run -- consider a requirements.txt instead.
subprocess.run(
    [sys.executable, "-m", "pip", "install",
     "gradio", "pandas", "transformers", "torch"],
    check=True,
)

# Install docling-core straight from GitHub.
subprocess.run(
    [sys.executable, "-m", "pip", "install",
     "git+https://github.com/docling-project/docling-core.git"],
    check=True,
)

# Environment variable telling docling where the models live.
os.environ["DOCLING_MODEL_HOME"] = os.path.expanduser("~/.docling/models")

# Create the model directory if it does not exist yet.
model_dir = os.path.expanduser("~/.docling/models/tableformer/accurate")
os.makedirs(model_dir, exist_ok=True)

# Importar após a instalação
from docling_core.pipelines.table import TableExtractionPipeline
from docling_core.models import ModelManager

def process_pdf(file):
    """Extract the tables of an uploaded PDF as pandas DataFrames.

    The upload is copied to a temporary ``.pdf`` file and handed to
    ``TableExtractionPipeline``. The temporary copy is always removed
    afterwards, including when no table is found or the pipeline raises.
    Every detected table is also written to ``tabela_<n>.csv`` in the
    current working directory.

    Args:
        file: The uploaded PDF. Either a filesystem path (``str`` or
            ``os.PathLike``) or an object whose ``read()`` returns the PDF
            bytes. ``None`` (nothing uploaded) yields a status message
            instead of an error.

    Returns:
        A ``(status, tables)`` tuple. ``status`` is a user-facing message;
        ``tables`` is a list of ``(title, DataFrame)`` pairs, or ``None``
        when nothing was uploaded or no table was detected.
    """
    if file is None:
        return "Nenhum arquivo enviado.", None

    # Depending on the Gradio version / File "type" setting, the callback
    # receives either a path or a file-like object; support both.
    if isinstance(file, (str, os.PathLike)):
        with open(file, "rb") as src:
            pdf_bytes = src.read()
    else:
        pdf_bytes = file.read()

    # delete=False because the pipeline reopens the file by path; removal is
    # therefore done explicitly in the ``finally`` below.
    tmp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    pdf_path = tmp_pdf.name
    try:
        with tmp_pdf:
            tmp_pdf.write(pdf_bytes)

        manager = ModelManager(auto_download=True)
        pipeline = TableExtractionPipeline(model_manager=manager)
        result = pipeline.run(pdf_path)
        tables = result["tables"]

        if not tables:
            return "Nenhuma tabela detectada.", None

        outputs = []
        for number, table in enumerate(tables, start=1):
            df = pd.DataFrame(table.rows)
            df.to_csv(f"tabela_{number}.csv", index=False)
            outputs.append(
                (f"Tabela {number} (página {table.page_number})", df)
            )
    finally:
        # Runs on success, on the early "no tables" return and on errors,
        # so the temporary copy never accumulates on disk.
        os.unlink(pdf_path)

    return "Tabelas extraídas com sucesso!", outputs

def show_tables(file):
    """Gradio callback: run ``process_pdf`` and fill the table slots.

    The click handler is wired to exactly five outputs (status plus two
    Markdown/Dataframe pairs), so this function always returns exactly five
    values regardless of how many tables were found. Slots that receive a
    table are made visible; unused slots are hidden, which also clears
    tables left over from a previous run. Tables beyond the second one are
    not displayed.

    Args:
        file: The value of the ``gr.File`` input, forwarded unchanged to
            ``process_pdf``.

    Returns:
        A 5-tuple ``(status, md1, df1, md2, df2)`` where the last four items
        are ``gr.update(...)`` payloads for the output components.
    """
    slot_count = 2  # number of Markdown/Dataframe pairs defined in the UI

    status, results = process_pdf(file)
    results = results or []

    updates = []
    for slot in range(slot_count):
        if slot < len(results):
            title, df = results[slot]
            updates.append(gr.update(value=f"### {title}", visible=True))
            updates.append(gr.update(value=df, visible=True))
        else:
            updates.append(gr.update(visible=False))
            updates.append(gr.update(visible=False))

    return (status, *updates)

# Page layout: title, an upload row with the trigger button, a status box
# and two initially hidden title/table pairs filled in by show_tables.
with gr.Blocks() as demo:
    gr.Markdown("# 🧾 TableFormer via Docling")

    with gr.Row():
        file = gr.File(file_types=[".pdf"], label="Envie o PDF do balancete")
        btn = gr.Button("Processar")

    status = gr.Textbox(label="Status")

    # Components are created left to right, so the on-page order stays
    # heading 1, table 1, heading 2, table 2.
    output1_md, output1_df = gr.Markdown(visible=False), gr.Dataframe(visible=False)
    output2_md, output2_df = gr.Markdown(visible=False), gr.Dataframe(visible=False)

    # The callback's return values map onto these outputs by position.
    btn.click(
        fn=show_tables,
        inputs=file,
        outputs=[
            status,
            output1_md,
            output1_df,
            output2_md,
            output2_df,
        ],
    )

# Start the web server for the interface.
demo.launch()