| | |
| | import gradio as gr |
| | import tempfile |
| | import pandas as pd |
| | import os |
| | import sys |
| | import subprocess |
| |
|
| | |
# --- Runtime environment bootstrap -----------------------------------------
# NOTE: gradio and pandas are imported at the top of this file, before these
# installs run, so this step cannot rescue a missing gradio/pandas; it only
# helps for the packages that are imported further down.
#
# pip is invoked through the interpreter that is running this script
# (`sys.executable -m pip`) so packages land in the same environment the
# later imports will search, even when a different `pip` is first on PATH.
subprocess.run(
    [sys.executable, "-m", "pip", "install", "gradio", "pandas", "transformers", "torch"],
    check=True,
)

# docling-core is installed straight from its Git repository.
subprocess.run(
    [
        sys.executable,
        "-m",
        "pip",
        "install",
        "git+https://github.com/docling-project/docling-core.git",
    ],
    check=True,
)

# Point Docling at a per-user model cache directory.
os.environ["DOCLING_MODEL_HOME"] = os.path.expanduser("~/.docling/models")

# Make sure the TableFormer "accurate" model directory exists up front.
model_dir = os.path.expanduser("~/.docling/models/tableformer/accurate")
os.makedirs(model_dir, exist_ok=True)
| |
|
| | |
| | from docling_core.pipelines.table import TableExtractionPipeline |
| | from docling_core.models import ModelManager |
| |
|
def _read_upload_bytes(file):
    """Return the raw bytes of an upload given as bytes, a path, or a file object."""
    if isinstance(file, (bytes, bytearray)):
        return bytes(file)

    if isinstance(file, (str, os.PathLike)):
        source_path = os.fspath(file)
    else:
        # Some upload wrappers carry the real on-disk location in `.name`;
        # prefer it when it points at an existing file.
        candidate = getattr(file, "name", None)
        if isinstance(candidate, str) and os.path.isfile(candidate):
            source_path = candidate
        else:
            source_path = None

    if source_path is not None:
        with open(source_path, "rb") as handle:
            return handle.read()

    # Last resort: treat the upload as an open binary file object.
    return file.read()


def process_pdf(file):
    """Extract every table from an uploaded PDF.

    Args:
        file: The upload handed over by the UI. Accepted forms are raw
            bytes, a filesystem path, an object with a ``name`` attribute
            pointing at an existing file, or a readable binary file object.
            ``None`` means nothing was uploaded.

    Returns:
        A ``(status, tables)`` tuple. ``status`` is a user-facing message.
        ``tables`` is a list of ``(title, DataFrame)`` pairs, or ``None``
        when there was no upload or no table was detected.

    Side effects:
        Each table is also written to ``tabela_<n>.csv`` in the current
        working directory, overwriting files from earlier runs.
    """
    if file is None:
        return "Nenhum arquivo enviado.", None

    pdf_bytes = _read_upload_bytes(file)

    # The pipeline is given a path, so the upload is copied to a private
    # temporary file that is always removed afterwards.
    tmp_pdf = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
    pdf_path = tmp_pdf.name
    try:
        with tmp_pdf:
            tmp_pdf.write(pdf_bytes)

        # NOTE(review): ModelManager / TableExtractionPipeline are used as
        # imported by this file; confirm they exist in the installed
        # docling-core version.
        manager = ModelManager(auto_download=True)
        pipeline = TableExtractionPipeline(model_manager=manager)
        result = pipeline.run(pdf_path)
        tables = result["tables"]

        if not tables:
            return "Nenhuma tabela detectada.", None

        outputs = []
        for i, table in enumerate(tables):
            df = pd.DataFrame(table.rows)
            csv_path = f"tabela_{i+1}.csv"
            df.to_csv(csv_path, index=False)
            outputs.append((f"Tabela {i+1} (página {table.page_number})", df))

        return "Tabelas extraídas com sucesso!", outputs
    finally:
        # Runs on success, on the "no tables" early return and on errors,
        # so the temporary copy never leaks.
        os.unlink(pdf_path)
| |
|
def show_tables(file, max_tables=2):
    """Run table extraction and build updates for the UI output components.

    The result always has the same length, ``1 + 2 * max_tables``: the
    status text followed by one (heading, dataframe) update pair per table
    slot. The click handler binds a fixed list of output components, so a
    return value whose length depends on the number of tables found would
    not match it.

    Args:
        file: The uploaded PDF as delivered by the file input.
        max_tables: Number of (heading, dataframe) slots available in the
            UI. The default of 2 matches the two output pairs wired to the
            button in this file. Tables beyond this count are not shown.

    Returns:
        A tuple ``(status, md_1, df_1, ..., md_n, df_n)`` where each
        ``md_i`` / ``df_i`` is a ``gr.update(...)`` that either fills and
        reveals the slot or hides it.
    """
    status, results = process_pdf(file)
    results = results or []

    updates = [status]
    for slot in range(max_tables):
        if slot < len(results):
            title, df = results[slot]
            # The output components are created hidden, so they must be
            # made visible explicitly when they receive content.
            updates.append(gr.update(value=f"### {title}", visible=True))
            updates.append(gr.update(value=df, visible=True))
        else:
            # Unused slot: keep it hidden (also hides results from a
            # previous run).
            updates.append(gr.update(visible=False))
            updates.append(gr.update(visible=False))

    return tuple(updates)
| |
|
# Gradio front end: a PDF upload, a trigger button, a status line and two
# (heading, table) output pairs that start out hidden.
with gr.Blocks() as demo:
    gr.Markdown("# 🧾 TableFormer via Docling")

    # Upload control and action button sit side by side.
    with gr.Row():
        file = gr.File(file_types=[".pdf"], label="Envie o PDF do balancete")
        btn = gr.Button("Processar")

    status = gr.Textbox(label="Status")

    # First table slot.
    output1_md = gr.Markdown(visible=False)
    output1_df = gr.Dataframe(visible=False)

    # Second table slot.
    output2_md = gr.Markdown(visible=False)
    output2_df = gr.Dataframe(visible=False)

    # Clicking the button feeds the uploaded file to show_tables and routes
    # its return values, in order, to the components listed here.
    btn.click(
        show_tables,
        inputs=file,
        outputs=[status, output1_md, output1_df, output2_md, output2_df],
    )

# Start the web server for the app.
demo.launch()