Commit 419fb7d
Parent(s): c2d2ccd

Allow Tesseract to run OCR in line-level mode and then query the LLM with line-level data. Added an option for running as an MCP server, and an API for multi-word text search.

Files changed:
- README.md (+4 -4)
- app.py (+6 -2)
- pyproject.toml (+6 -7)
- tools/config.py (+11 -1)
- tools/custom_image_analyser_engine.py (+114 -11)
- tools/word_segmenter.py (+7 -8)
README.md CHANGED

@@ -10,11 +10,13 @@ license: agpl-3.0
 ---
 # Document redaction
 
-version: 1.5.
+version: 1.5.2
 
 Redact personally identifiable information (PII) from documents (pdf, png, jpg), Word files (docx), or tabular data (xlsx/csv/parquet). Please see the [User Guide](#user-guide) for a full walkthrough of all the features in the app.
 
-To identify text in documents, the 'Local' text extraction uses PikePDF, and OCR image analysis uses Tesseract, and works well only for documents with typed text or scanned PDFs with clear text. Use AWS Textract to extract more complex elements e.g. handwriting, signatures, or unclear text.
+To identify text in documents, the 'Local' text extraction uses PikePDF, and OCR image analysis uses Tesseract, and works well only for documents with typed text or scanned PDFs with clear text. Use AWS Textract to extract more complex elements e.g. handwriting, signatures, or unclear text. PaddleOCR and VLM support is also provided (see the installation instructions below).
+
+For PII identification, 'Local' (based on spaCy) gives good results if you are looking for common names or terms, or a custom list of terms to redact (see Redaction settings). AWS Comprehend gives better results at a small cost.
 
 Additional options on the 'Redaction settings' include the type of information to redact (e.g. people, places), custom terms to include/exclude from redaction, fuzzy matching, language settings, and whole page redaction. After redaction is complete, you can view and modify suggested redactions on the 'Review redactions' tab to quickly create a final redacted document.
@@ -589,8 +591,6 @@ The workflow is designed to be simple: **Search → Select → Redact**.
 
 #### **Step 1: Search for Text**
 
-#### **Step 1: Search for Text**
-
 1. Navigate to the **"Search text to make new redactions"** tab.
 2. The main table will initially be populated with all the text extracted from the document for a page, broken down by word.
 3. To narrow this down, use the **"Multi-word text search"** box to type the word or phrase you want to find (this will search the whole document). If you want to do a regex-based search, tick the 'Enable regex pattern matching' box under 'Search options' below (note this will only be able to search for patterns in text within each cell).
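The multi-word search described in Step 3 is also exposed as a named API route by this commit (see `api_name="word_level_ocr_text_search"` in the app.py diff below). Here is a hypothetical sketch of calling it from Python with `gradio_client`; only the endpoint name comes from the commit, the argument list is not shown in this diff, so inspect `view_api()` for the real signature.

```python
# Hypothetical sketch, not from the repo: querying the new search endpoint.
from gradio_client import Client

client = Client("http://localhost:7860")  # wherever the redaction app is running
print(client.view_api())  # lists exposed endpoints, including /word_level_ocr_text_search
# result = client.predict(..., api_name="/word_level_ocr_text_search")  # fill in args per view_api()
```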
app.py CHANGED

@@ -119,6 +119,7 @@ from tools.config import (
     RUN_AWS_FUNCTIONS,
     RUN_DIRECT_MODE,
     RUN_FASTAPI,
+    RUN_MCP_SERVER,
     S3_ACCESS_LOGS_FOLDER,
     S3_ALLOW_LIST_PATH,
     S3_COST_CODES_PATH,
@@ -1258,7 +1259,7 @@ with blocks:
                 open=EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT,
             ):
                 local_ocr_method_radio = gr.Radio(
-                    label="""Choose local OCR model. "tesseract" is the default and will work for
+                    label="""Choose a local OCR model. "tesseract" is the default and will work for documents with clear typed text. "paddle" is more accurate for text extraction where the text is not clear or well-formatted, but word-level extraction is not natively supported, so word bounding boxes will be inaccurate. The hybrid models do a first pass with one model, then a second pass with a more powerful model on words/phrases with low confidence. "hybrid-paddle" does the first pass with Tesseract and the second with PaddleOCR. "hybrid-vlm" combines Tesseract for OCR with a second pass by the chosen vision model (VLM). "hybrid-paddle-vlm" combines PaddleOCR with the chosen VLM.""",
                     value=CHOSEN_LOCAL_OCR_MODEL,
                     choices=LOCAL_OCR_MODEL_OPTIONS,
                     interactive=True,
@@ -4755,7 +4756,7 @@ with blocks:
             duplicate_files_out,
             full_duplicate_data_by_file,
         ],
-    )
+        api_name="word_level_ocr_text_search")
 
     # Clicking on a cell in the redact items table will take you to that page
     all_page_line_level_ocr_results_with_words_df.select(
@@ -6549,6 +6550,7 @@ with blocks:
             max_file_size=MAX_FILE_SIZE,
             path=FASTAPI_ROOT_PATH,
             favicon_path=Path(FAVICON_PATH),
+            mcp_server=RUN_MCP_SERVER,
         )
 
     # Example command to run in uvicorn (in python): uvicorn.run("app:app", host=GRADIO_SERVER_NAME, port=GRADIO_SERVER_PORT)
@@ -6566,6 +6568,7 @@ with blocks:
             server_port=GRADIO_SERVER_PORT,
             root_path=ROOT_PATH,
             favicon_path=Path(FAVICON_PATH),
+            mcp_server=RUN_MCP_SERVER,
         )
     else:
         blocks.launch(
@@ -6576,6 +6579,7 @@ with blocks:
             server_port=GRADIO_SERVER_PORT,
             root_path=ROOT_PATH,
             favicon_path=Path(FAVICON_PATH),
+            mcp_server=RUN_MCP_SERVER,
         )
 
 else:
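For context on the `mcp_server` flag being threaded into `blocks.launch()`: recent Gradio releases can expose API-named endpoints as MCP tools directly from `launch()`. A minimal sketch under that assumption (the demo function and endpoint path below are illustrative, not part of this commit):

```python
# Minimal sketch assuming a recent Gradio with MCP support (pip install "gradio[mcp]").
import gradio as gr

def reverse(text: str) -> str:
    """Reverse the input text."""
    return text[::-1]

with gr.Blocks() as blocks:
    inp = gr.Textbox()
    out = gr.Textbox()
    inp.submit(reverse, inp, out, api_name="reverse")

# With mcp_server=True, Gradio also serves the named endpoints as MCP tools
# (by default at http://localhost:7860/gradio_api/mcp/sse in current releases).
blocks.launch(mcp_server=True)
```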
pyproject.toml CHANGED

@@ -2,17 +2,16 @@
 requires = ["setuptools>=61.0", "wheel"]
 build-backend = "setuptools.build_meta"
 
+[project.urls]
+Homepage = "https://seanpedrick-case.github.io/doc_redaction/"
+Repository = "https://github.com/seanpedrick-case/doc_redaction"
+
 [project]
 name = "doc_redaction"
-version = "1.5.
+version = "1.5.2"
 description = "Redact PDF/image-based documents, Word, or CSV/XLSX files using a Gradio-based GUI interface"
 readme = "README.md"
 requires-python = ">=3.10"
-
-[project.urls]
-Homepage = "https://seanpedrick-case.github.io/doc_redaction/"
-Repository = "https://github.com/seanpedrick-case/doc_redaction"
-
 dependencies = [
     "pdfminer.six==20250506",
     "pdf2image==1.17.0",
@@ -61,7 +60,7 @@ paddle = [
 # Extra dependencies for VLM models
 # For torch you should use --index-url https://download.pytorch.org/whl/cu126 for cuda support for paddleocr, need to install manually
 vlm = [
-    "torch
+    "torch>=2.5.1,<=2.8.0",
     "torchvision>=0.20.1",
     "transformers==4.57.1",
     "accelerate==1.11.0",
```
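The new torch pin allows a version range rather than a single exact version. As a quick sanity check (a sketch, not part of the repo), you can verify that an installed torch falls inside the pinned range, e.g. after `pip install ".[vlm]"`:

```python
# Sketch: verify the installed torch satisfies the new pin torch>=2.5.1,<=2.8.0.
from packaging.version import Version
import torch

base = Version(torch.__version__.split("+")[0])  # strip local tags like "+cu126"
assert Version("2.5.1") <= base <= Version("2.8.0"), f"torch {base} outside pinned range"
print(f"torch {torch.__version__} is within the pinned range")
```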
tools/config.py CHANGED

@@ -281,6 +281,8 @@ FAVICON_PATH = get_or_create_env_var("FAVICON_PATH", "favicon.png")
 
 RUN_FASTAPI = convert_string_to_boolean(get_or_create_env_var("RUN_FASTAPI", "False"))
 
+RUN_MCP_SERVER = convert_string_to_boolean(get_or_create_env_var("RUN_MCP_SERVER", "False"))
+
 MAX_QUEUE_SIZE = int(get_or_create_env_var("MAX_QUEUE_SIZE", "5"))
 
 MAX_FILE_SIZE = get_or_create_env_var("MAX_FILE_SIZE", "250mb").lower()
@@ -492,7 +494,7 @@ OVERWRITE_EXISTING_OCR_RESULTS = convert_string_to_boolean(
 ### Local OCR model - Tesseract vs PaddleOCR
 CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var(
     "CHOSEN_LOCAL_OCR_MODEL", "tesseract"
-) #
+) # "tesseract" is the default and will work for documents with clear typed text. "paddle" is more accurate for text extraction where the text is not clear or well-formatted, but word-level extraction is not natively supported, so word bounding boxes will be inaccurate. The hybrid models do a first pass with one model, then a second pass with a more powerful model on words/phrases with low confidence. "hybrid-paddle" does the first pass with Tesseract and the second with PaddleOCR. "hybrid-vlm" combines Tesseract for OCR with a second pass by the chosen vision model (VLM). "hybrid-paddle-vlm" combines PaddleOCR with the chosen VLM.
 
 SHOW_LOCAL_OCR_MODEL_OPTIONS = convert_string_to_boolean(
     get_or_create_env_var("SHOW_LOCAL_OCR_MODEL_OPTIONS", "False")
@@ -525,6 +527,10 @@ HYBRID_OCR_PADDING = int(
     get_or_create_env_var("HYBRID_OCR_PADDING", "1")
 ) # The padding to add to the text when passing it to PaddleOCR for re-extraction using the hybrid OCR method.
 
+TESSERACT_WORD_LEVEL_OCR = convert_string_to_boolean(
+    get_or_create_env_var("TESSERACT_WORD_LEVEL_OCR", "True")
+) # Whether to keep Tesseract output at word level. If False, results are aggregated to line level.
+
 TESSERACT_SEGMENTATION_LEVEL = int(
     get_or_create_env_var("TESSERACT_SEGMENTATION_LEVEL", "11")
 ) # Tesseract segmentation level: PSM level to use for Tesseract OCR
@@ -553,6 +559,10 @@ SAVE_PAGE_OCR_VISUALISATIONS = convert_string_to_boolean(
     get_or_create_env_var("SAVE_PAGE_OCR_VISUALISATIONS", "False")
 ) # Whether to save visualisations of Tesseract, PaddleOCR, and Textract bounding boxes.
 
+SAVE_WORD_SEGMENTER_OUTPUT_IMAGES = convert_string_to_boolean(
+    get_or_create_env_var("SAVE_WORD_SEGMENTER_OUTPUT_IMAGES", "False")
+) # Whether to save output images from the word segmenter.
+
 # Model storage paths for Lambda compatibility
 PADDLE_MODEL_PATH = get_or_create_env_var(
     "PADDLE_MODEL_PATH", ""
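All three new settings are read through `get_or_create_env_var`, so they can be toggled from the environment before the app imports tools/config.py. A small sketch (the variable names match this commit; everything else is illustrative):

```python
# Sketch: configure the new flags via environment variables before importing the app.
import os

os.environ["RUN_MCP_SERVER"] = "True"                     # also serve the Gradio API as an MCP server
os.environ["TESSERACT_WORD_LEVEL_OCR"] = "False"          # aggregate Tesseract output to line level
os.environ["SAVE_WORD_SEGMENTER_OUTPUT_IMAGES"] = "True"  # save word-segmenter debug images

import app  # noqa: E402 - tools/config.py evaluates get_or_create_env_var at import time
```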
tools/custom_image_analyser_engine.py CHANGED

@@ -10,6 +10,7 @@ import botocore
 import cv2
 import gradio as gr
 import numpy as np
+import pandas as pd
 import pytesseract
 from pdfminer.layout import LTChar
 from PIL import Image
@@ -34,6 +35,7 @@ from tools.config import (
     SAVE_VLM_INPUT_IMAGES,
     SELECTED_MODEL,
     TESSERACT_SEGMENTATION_LEVEL,
+    TESSERACT_WORD_LEVEL_OCR,
     VLM_MAX_DPI,
     VLM_MAX_IMAGE_SIZE,
 )
@@ -1238,11 +1240,13 @@ class CustomImageAnalyzerEngine:
             print(
                 f"Warning: Image dimension mismatch! Expected {image_width}x{image_height}, but got {actual_width}x{actual_height}"
             )
-            print(f"Using actual dimensions: {actual_width}x{actual_height}")
+            # print(f"Using actual dimensions: {actual_width}x{actual_height}")
             # Update to use actual dimensions
             image_width = actual_width
             image_height = actual_height
 
+        print("segmenting line-level OCR results to word-level...")
+
         segmenter = AdaptiveSegmenter(output_folder=self.output_folder)
 
         # Process each line
@@ -1591,6 +1595,30 @@ class CustomImageAnalyzerEngine:
             1,
         )
 
+    # Calculate line-level bounding boxes and average confidence
+    def _calculate_line_bbox(self, group):
+        # Get the leftmost and rightmost positions
+        left = group['left'].min()
+        top = group['top'].min()
+        right = (group['left'] + group['width']).max()
+        bottom = (group['top'] + group['height']).max()
+
+        # Calculate width and height
+        width = right - left
+        height = bottom - top
+
+        # Calculate average confidence
+        avg_conf = round(group['conf'].mean(), 0)
+
+        return pd.Series({
+            'text': ' '.join(group['text'].astype(str).tolist()),
+            'left': left,
+            'top': top,
+            'width': width,
+            'height': height,
+            'conf': avg_conf
+        })
+
     def _perform_hybrid_ocr(
         self,
         image: Image.Image,
@@ -1600,8 +1628,22 @@ class CustomImageAnalyzerEngine:
         image_name: str = "unknown_image_name",
     ) -> Dict[str, list]:
         """
-        Performs OCR using Tesseract for
-
+        Performs hybrid OCR on an image using Tesseract for initial OCR and PaddleOCR/VLM to enhance
+        results for low-confidence or uncertain words.
+
+        Args:
+            image (Image.Image): The input image (PIL format) to be processed.
+            confidence_threshold (int, optional): Tesseract confidence threshold below which words are
+                re-analyzed with secondary OCR (PaddleOCR/VLM). Defaults to HYBRID_OCR_CONFIDENCE_THRESHOLD.
+            padding (int, optional): Pixel padding (in all directions) to add around each word box when
+                cropping for secondary OCR. Defaults to HYBRID_OCR_PADDING.
+            ocr (Optional[Any], optional): An instance of the PaddleOCR or VLM engine. If None, will use the
+                instance's `paddle_ocr` attribute if available. Only necessary for PaddleOCR-based pipelines.
+            image_name (str, optional): Optional name of the image, useful for debugging and visualization.
+
+        Returns:
+            Dict[str, list]: OCR results in the dictionary format of pytesseract.image_to_data (keys:
+                'text', 'left', 'top', 'width', 'height', 'conf', 'model', ...).
         """
         # Determine if we're using VLM or PaddleOCR
         use_vlm = self.ocr_engine == "hybrid-vlm"
@@ -1615,15 +1657,37 @@ class CustomImageAnalyzerEngine:
                 "No OCR object provided and 'paddle_ocr' is not initialized."
             )
 
-        print("Starting hybrid OCR process...")
+        # print("Starting hybrid OCR process...")
 
         # 1. Get initial word-level results from Tesseract
         tesseract_data = pytesseract.image_to_data(
             image,
             output_type=pytesseract.Output.DICT,
             config=self.tesseract_config,
             lang=self.tesseract_lang,
         )
+
+        if TESSERACT_WORD_LEVEL_OCR is False:
+            ocr_df = pd.DataFrame(tesseract_data)
+
+            # Filter out invalid entries (confidence == -1)
+            ocr_df = ocr_df[ocr_df.conf != -1]
+
+            # Group by line and aggregate text
+            line_groups = ocr_df.groupby(['block_num', 'par_num', 'line_num'])
+
+            ocr_data = line_groups.apply(self._calculate_line_bbox).reset_index()
+
+            # Overwrite tesseract_data with the aggregated data
+            tesseract_data = {
+                'text': ocr_data['text'].tolist(),
+                'left': ocr_data['left'].astype(int).tolist(),
+                'top': ocr_data['top'].astype(int).tolist(),
+                'width': ocr_data['width'].astype(int).tolist(),
+                'height': ocr_data['height'].astype(int).tolist(),
+                'conf': ocr_data['conf'].tolist(),
+                'model': ['Tesseract'] * len(ocr_data)  # Add model field
+            }
 
         final_data = {
             "text": list(),
@@ -1708,7 +1772,7 @@ class CustomImageAnalyzerEngine:
                     text, new_text, conf, new_conf, ocr_type
                 )
 
-                if SAVE_EXAMPLE_HYBRID_IMAGES
+                if SAVE_EXAMPLE_HYBRID_IMAGES:
                     # Normalize and validate image_name to prevent path traversal attacks
                     normalized_image_name = os.path.normpath(
                         image_name + "_" + ocr_type
@@ -2196,6 +2260,28 @@ class CustomImageAnalyzerEngine:
                 lang=self.tesseract_lang,  # Ensure the Tesseract language data (e.g., fra.traineddata) is installed on your system.
             )
 
+            if TESSERACT_WORD_LEVEL_OCR is False:
+                ocr_df = pd.DataFrame(ocr_data)
+
+                # Filter out invalid entries (confidence == -1)
+                ocr_df = ocr_df[ocr_df.conf != -1]
+
+                # Group by line and aggregate text
+                line_groups = ocr_df.groupby(['block_num', 'par_num', 'line_num'])
+
+                ocr_data = line_groups.apply(self._calculate_line_bbox).reset_index()
+
+                # Convert DataFrame to dictionary of lists format expected by downstream code
+                ocr_data = {
+                    'text': ocr_data['text'].tolist(),
+                    'left': ocr_data['left'].astype(int).tolist(),
+                    'top': ocr_data['top'].astype(int).tolist(),
+                    'width': ocr_data['width'].astype(int).tolist(),
+                    'height': ocr_data['height'].astype(int).tolist(),
+                    'conf': ocr_data['conf'].tolist(),
+                    'model': ['Tesseract'] * len(ocr_data)  # Add model field
+                }
+
         elif self.ocr_engine == "paddle" or self.ocr_engine == "hybrid-paddle-vlm":
 
             if ocr is None:
@@ -2371,13 +2457,15 @@ class CustomImageAnalyzerEngine:
 
         # Convert line-level results to word-level if configured and needed
         if CONVERT_LINE_TO_WORD_LEVEL and self._is_line_level_data(ocr_data):
-            #
+            # print("Converting line-level OCR results to word-level...")
+
             # Check if coordinates need to be scaled to match the image we're cropping from
             # For PaddleOCR: _convert_paddle_to_tesseract_format converts coordinates to original image space
             # - If PaddleOCR processed the original image (image_path provided), crop from original image (no scaling)
             # - If PaddleOCR processed the preprocessed image (no image_path), scale coordinates to preprocessed space and crop from preprocessed image
             # For Tesseract: OCR runs on preprocessed image
-            #
+            # - If scale_factor != 1.0, rescale_ocr_data converted coordinates to original space, so crop from original image
+            # - If scale_factor == 1.0, coordinates are still in preprocessed space, so crop from preprocessed image
 
             needs_scaling = False
             crop_image = image  # Default to preprocessed image
@@ -2405,6 +2493,19 @@ class CustomImageAnalyzerEngine:
                 else:
                     # PaddleOCR processed the preprocessed image, so scale coordinates to preprocessed space
                     needs_scaling = True
+            elif self.ocr_engine == "tesseract":
+                # For Tesseract: if scale_factor != 1.0, rescale_ocr_data converted coordinates to original space
+                # So we need to crop from the original image, not the preprocessed image
+                if scale_factor != 1.0 and original_image_for_visualization is not None:
+                    # Coordinates are in original space, so crop from original image
+                    crop_image = original_image_for_visualization
+                    crop_image_width = original_image_width
+                    crop_image_height = original_image_height
+                    needs_scaling = False
+                else:
+                    # scale_factor == 1.0, so coordinates are still in preprocessed space
+                    # Crop from preprocessed image - no scaling needed
+                    needs_scaling = False
 
             if needs_scaling:
                 # Calculate scale factors from original to preprocessed
@@ -2488,7 +2589,8 @@ class CustomImageAnalyzerEngine:
         def get_model(idx):
            return default_model
 
-        return [
+        output = [
             OCRResult(
                 text=clean_unicode_text(ocr_result["text"][i]),
                 left=ocr_result["left"][i],
@@ -2497,11 +2599,12 @@ class CustomImageAnalyzerEngine:
                 height=ocr_result["height"][i],
                 conf=round(float(ocr_result["conf"][i]), 0),
                 model=get_model(i),
-                # line_number=ocr_result['abs_line_id'][i]
             )
             for i in valid_indices
         ]
 
+        return output
+
     def analyze_text(
         self,
         line_level_ocr_results: List[OCRResult],
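To see what the `TESSERACT_WORD_LEVEL_OCR is False` branch produces, here is a self-contained sketch of the same groupby aggregation applied to a stand-in pytesseract dict (the sample data below is invented for illustration):

```python
# Standalone sketch of the line-level aggregation above; the input dict mimics
# pytesseract.image_to_data(..., output_type=Output.DICT) for two words on one line.
import pandas as pd

tesseract_data = {
    "block_num": [1, 1], "par_num": [1, 1], "line_num": [1, 1],
    "text": ["Hello", "world"],
    "left": [10, 60], "top": [20, 22],
    "width": [45, 50], "height": [12, 10],
    "conf": [88.0, 42.0],
}

def calculate_line_bbox(group):
    # Union of the word boxes plus the mean word confidence, as in _calculate_line_bbox
    left, top = group["left"].min(), group["top"].min()
    right = (group["left"] + group["width"]).max()
    bottom = (group["top"] + group["height"]).max()
    return pd.Series({
        "text": " ".join(group["text"].astype(str)),
        "left": left, "top": top,
        "width": right - left, "height": bottom - top,
        "conf": round(group["conf"].mean(), 0),
    })

ocr_df = pd.DataFrame(tesseract_data)
ocr_df = ocr_df[ocr_df.conf != -1]  # drop entries pytesseract marks invalid
lines = ocr_df.groupby(["block_num", "par_num", "line_num"]).apply(calculate_line_bbox).reset_index()
print(lines)  # one row: text="Hello world", left=10, top=20, width=100, height=12, conf=65.0
```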
tools/word_segmenter.py CHANGED

@@ -4,7 +4,7 @@ from typing import Dict, List, Tuple
 import cv2
 import numpy as np
 
-from tools.config import OUTPUT_FOLDER
+from tools.config import OUTPUT_FOLDER, SAVE_WORD_SEGMENTER_OUTPUT_IMAGES
 
 INITIAL_KERNEL_WIDTH_FACTOR = 0.05  # Default 0.05
 INITIAL_VALLEY_THRESHOLD_FACTOR = 0.05  # Default 0.05
@@ -15,7 +15,6 @@ MIN_SPACE_FACTOR = 0.3  # Default 0.4
 MATCH_TOLERANCE = 0  # Default 0
 MIN_AREA_THRESHOLD = 6  # Default 6
 DEFAULT_TRIM_PERCENTAGE = 0.2  # Default 0.2
-SHOW_OUTPUT_IMAGES = False  # Default False
 
 
 class AdaptiveSegmenter:
@@ -291,7 +290,7 @@ class AdaptiveSegmenter:
         # print(f"line_text: {line_text}")
         shortened_line_text = line_text.replace(" ", "_")[:10]
 
-        if SHOW_OUTPUT_IMAGES:
+        if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
             os.makedirs(self.output_folder, exist_ok=True)
             output_path = f"{self.output_folder}/word_segmentation/{image_name}_{shortened_line_text}_original.png"
             os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
@@ -346,7 +345,7 @@ class AdaptiveSegmenter:
             return ({}, False)
 
         # Save deskewed image (optional, only if image_name is provided)
-        if SHOW_OUTPUT_IMAGES:
+        if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
             os.makedirs(self.output_folder, exist_ok=True)
             output_path = f"{self.output_folder}/word_segmentation/{image_name}_{shortened_line_text}_deskewed.png"
             os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
@@ -402,7 +401,7 @@ class AdaptiveSegmenter:
             return ({}, False)
 
         # Save cropped image (optional, only if image_name is provided)
-        if SHOW_OUTPUT_IMAGES:
+        if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
             os.makedirs(self.output_folder, exist_ok=True)
             output_path = f"{self.output_folder}/word_segmentation/{image_name}_{shortened_line_text}_binary.png"
             os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
@@ -436,7 +435,7 @@ class AdaptiveSegmenter:
         # dilated_binary = cv2.dilate(closed_binary, kernel, iterations=1)
         # Use 'closed_binary' (or 'dilated_binary') from now on.
 
-        if SHOW_OUTPUT_IMAGES:
+        if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
             os.makedirs(self.output_folder, exist_ok=True)
             output_path = f"{self.output_folder}/word_segmentation/{image_name}_{shortened_line_text}_closed_binary.png"
             os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
@@ -633,7 +632,7 @@ class AdaptiveSegmenter:
         # print(f"Target word count: {target_word_count}")
 
         # Save cropped image (optional, only if image_name is provided)
-        if SHOW_OUTPUT_IMAGES:
+        if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
             os.makedirs(self.output_folder, exist_ok=True)
             output_path = f"{self.output_folder}/word_segmentation/{image_name}_{shortened_line_text}_clean_binary.png"
             os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
@@ -898,7 +897,7 @@ class AdaptiveSegmenter:
                 remapped_output[key].append(box[key])
 
         # Visualisation
-        if SHOW_OUTPUT_IMAGES:
+        if SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
             output_path = f"{self.output_folder}/word_segmentation/{image_name}_{shortened_line_text}_final_boxes.png"
             os.makedirs(f"{self.output_folder}/word_segmentation", exist_ok=True)
             output_image_vis = line_image.copy()
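The guard/mkdir/imwrite pattern repeated in each hunk above could be distilled into a single helper. A sketch of that pattern (`save_debug_image` is a hypothetical helper, not part of this commit):

```python
# Sketch of the flag-guarded debug-image save pattern used throughout AdaptiveSegmenter.
import os
import cv2
import numpy as np

SAVE_WORD_SEGMENTER_OUTPUT_IMAGES = (
    os.environ.get("SAVE_WORD_SEGMENTER_OUTPUT_IMAGES", "False").lower() == "true"
)

def save_debug_image(image: np.ndarray, output_folder: str, stem: str) -> None:
    if not SAVE_WORD_SEGMENTER_OUTPUT_IMAGES:
        return  # debug output disabled: skip all filesystem work
    out_dir = os.path.join(output_folder, "word_segmentation")
    os.makedirs(out_dir, exist_ok=True)
    cv2.imwrite(os.path.join(out_dir, f"{stem}.png"), image)

# e.g. save_debug_image(line_image, OUTPUT_FOLDER, f"{image_name}_{shortened_line_text}_deskewed")
```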