# app.py
import gradio as gr
import tempfile
import pandas as pd
import os
import sys
import subprocess
# Runtime bootstrap: install the packages this script depends on, prepare the
# model directory, and only then import docling-core.
#
# pip is invoked through the running interpreter (``sys.executable -m pip``)
# so the packages are installed into the environment that is executing this
# script, not into whichever ``pip`` executable happens to be first on PATH.
_PIP_INSTALL = [sys.executable, "-m", "pip", "install"]

# Install dependencies.
# NOTE(review): gradio and pandas are already imported at the top of the file,
# so they must be present before this line runs; installing them here only
# matters for later runs — confirm whether this is still wanted.
subprocess.run(
    [*_PIP_INSTALL, "gradio", "pandas", "transformers", "torch"],
    check=True,
)
# Install docling-core directly from GitHub.
subprocess.run(
    [*_PIP_INSTALL, "git+https://github.com/docling-project/docling-core.git"],
    check=True,
)

# Set the environment variable that points at the model directory.
os.environ["DOCLING_MODEL_HOME"] = os.path.expanduser("~/.docling/models")
# Create the model directory if it does not exist yet.
model_dir = os.path.expanduser("~/.docling/models/tableformer/accurate")
os.makedirs(model_dir, exist_ok=True)

# Imported after the installation above, because the package may not exist
# before the pip step has run.
# NOTE(review): these module paths are taken as-is; verify them against the
# installed docling-core release.
from docling_core.pipelines.table import TableExtractionPipeline  # noqa: E402
from docling_core.models import ModelManager  # noqa: E402
def process_pdf(file):
    """Extract the tables found in an uploaded PDF.

    Args:
        file: The upload handed over by the UI. Either a file-like object
            exposing ``read()`` or a filesystem path (``str`` /
            ``os.PathLike``). ``None`` means nothing was uploaded.

    Returns:
        A ``(status, tables)`` tuple. ``status`` is the user-facing message.
        ``tables`` is ``None`` when there is nothing to show, otherwise a list
        of ``(title, DataFrame)`` pairs, one per detected table.

    Side effects:
        Every detected table is also written to ``tabela_<n>.csv`` in the
        current working directory.

    Raises:
        Whatever the extraction pipeline or the file operations raise; the
        temporary PDF copy is removed in that case as well.
    """
    # Clicking the button without an upload yields None; report it instead of
    # crashing on ``None.read()``.
    if file is None:
        return "Nenhum arquivo enviado.", None

    # Accept both a file-like upload and a plain path.
    if hasattr(file, "read"):
        pdf_bytes = file.read()
    else:
        with open(os.fspath(file), "rb") as upload:
            pdf_bytes = upload.read()

    # delete=False: the pipeline re-opens the file by name after it is closed.
    tmp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    pdf_path = tmp_pdf.name
    try:
        with tmp_pdf:
            tmp_pdf.write(pdf_bytes)

        manager = ModelManager(auto_download=True)
        pipeline = TableExtractionPipeline(model_manager=manager)
        result = pipeline.run(pdf_path)
        tables = result["tables"]

        if not tables:
            return "Nenhuma tabela detectada.", None

        outputs = []
        for number, table in enumerate(tables, start=1):
            df = pd.DataFrame(table.rows)
            df.to_csv(f"tabela_{number}.csv", index=False)
            outputs.append((f"Tabela {number} (página {table.page_number})", df))
        return "Tabelas extraídas com sucesso!", outputs
    finally:
        # Runs on success, on the "no tables" return and on errors, so the
        # temporary copy never outlives the call.
        os.unlink(pdf_path)
def show_tables(file):
    """Run the extraction and build the values for the five UI outputs.

    Args:
        file: The uploaded PDF, passed straight through to ``process_pdf``.

    Returns:
        A 5-tuple ``(status, md1, df1, md2, df2)``: the status message
        followed by one update per heading/table component. The tuple length
        is always 5, whatever the number of tables found, because the click
        handler is wired to exactly five output components.
    """
    # Number of (Markdown, Dataframe) pairs available in the UI.
    slot_count = 2

    status, results = process_pdf(file)
    shown = (results or [])[:slot_count]

    updates = []
    for title, df in shown:
        # The components are created hidden, so each filled slot has to be
        # made visible explicitly.
        updates.append(gr.update(value=f"### {title}", visible=True))
        updates.append(gr.update(value=df, visible=True))

    # Hide the unused slots (all of them when nothing was extracted) so the
    # return value keeps its fixed length and stale tables from a previous
    # run do not stay on screen.
    unused_components = 2 * (slot_count - len(shown))
    updates.extend(gr.update(visible=False) for _ in range(unused_components))

    return (status, *updates)
# Gradio UI: an upload row, a status box and two (heading, table) result slots.
# NOTE(review): the nesting below (file input and button inside the row,
# everything else directly under Blocks) is assumed — confirm the layout.
with gr.Blocks() as demo:
    gr.Markdown("# 🧾 TableFormer via Docling")

    with gr.Row():
        file = gr.File(file_types=[".pdf"], label="Envie o PDF do balancete")
        btn = gr.Button("Processar")

    status = gr.Textbox(label="Status")

    # Result slots are created hidden.
    output1_md = gr.Markdown(visible=False)
    output1_df = gr.Dataframe(visible=False)
    output2_md = gr.Markdown(visible=False)
    output2_df = gr.Dataframe(visible=False)

    # Status first, then heading/table pairs, in the order show_tables
    # returns them.
    result_components = [status, output1_md, output1_df, output2_md, output2_df]
    btn.click(fn=show_tables, inputs=file, outputs=result_components)

demo.launch()