Commit 5e01004 · Parent(s): 2f34683

Initial commit for VLM support. Created visualisations for OCR output. Corrected log_file_output_paths reference.

Files changed:
- .dockerignore (+1 -0)
- .gitignore (+1 -0)
- Dockerfile (+8 -3)
- README.md (+2 -2)
- app.py (+25 -21)
- cli_redact.py (+1 -1)
- requirements.txt (+12 -4)
- requirements_lightweight.txt (+38 -0)
- src/app_settings.qmd (+5 -5)
- src/user_guide.qmd (+1 -1)
- tools/config.py (+73 -5)
- tools/custom_image_analyser_engine.py (+638 -124)
- tools/file_redaction.py (+446 -18)
- tools/run_vlm.py (+211 -0)
.dockerignore CHANGED

@@ -34,3 +34,4 @@ test/output/*
 test/tmp/*
 test/usage/*
 .ruff_cache/*
+model_cache/*
.gitignore CHANGED

@@ -37,3 +37,4 @@ test/output/*
 test/tmp/*
 test/usage/*
 .ruff_cache/*
+model_cache/*
Dockerfile CHANGED

@@ -16,11 +16,11 @@ RUN apt-get update \
 
 WORKDIR /src
 
-COPY …
+COPY requirements_lightweight.txt .
 
-RUN pip install --verbose --no-cache-dir --target=/install -r …
+RUN pip install --verbose --no-cache-dir --target=/install -r requirements_lightweight.txt && rm requirements_lightweight.txt
 
-# Optionally install PaddleOCR if the INSTALL_PADDLEOCR environment variable is set to True. See …
+# Optionally install PaddleOCR if the INSTALL_PADDLEOCR environment variable is set to True. See requirements_lightweight.txt for more details, including installing the GPU version of PaddleOCR.
 ARG INSTALL_PADDLEOCR=False
 ENV INSTALL_PADDLEOCR=${INSTALL_PADDLEOCR}
 
@@ -28,6 +28,11 @@ RUN if [ "$INSTALL_PADDLEOCR" = "True" ]; then \
     pip install --verbose --no-cache-dir --target=/install paddleocr==3.3.0 paddlepaddle==3.2.0; \
     fi
 
+RUN if [ "$INSTALL_VLM" = "True" ]; then \
+    pip install --verbose --no-cache-dir --target=/install torch==2.6.0 torchvision --index-url https://download.pytorch.org/whl/cu126; \
+    pip install --verbose --no-cache-dir --target=/install transformers==4.57.1 accelerate==1.11.0 bitsandbytes==0.48.1; \
+    fi
+
 # ===================================================================
 # Stage 2: A common 'base' for both Lambda and Gradio
 # ===================================================================
README.md CHANGED

@@ -162,7 +162,7 @@ These settings are useful for all users, regardless of whether you are using AWS
 * Set to `True` to display a language selection dropdown in the UI for OCR processing.
 
 * `CHOSEN_LOCAL_OCR_MODEL=tesseract`
-  * Choose the backend for local OCR. Options are `tesseract`, `paddle`, or `hybrid`. "tesseract" is the default, and is recommended. "hybrid" is a combination of the two - the first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence. "paddle" will only return whole-line text extraction, and so will only work for OCR, not redaction.
+  * Choose the backend for local OCR. Options are `tesseract`, `paddle`, or `hybrid-paddle`. "tesseract" is the default, and is recommended. "hybrid-paddle" is a combination of the two - the first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence. "paddle" will only return whole-line text extraction, and so will only work for OCR, not redaction.
 
 * `SESSION_OUTPUT_FOLDER=False`
   * If `True`, redacted files will be saved in unique subfolders within the `output/` directory for each session.

@@ -922,7 +922,7 @@ The hybrid OCR mode uses several configurable parameters:
 
 - **HYBRID_OCR_CONFIDENCE_THRESHOLD** (default: 65): Tesseract confidence score below which PaddleOCR will be used for re-extraction
 - **HYBRID_OCR_PADDING** (default: 1): Padding added to word bounding boxes before re-extraction
-- **…
+- **SAVE_EXAMPLE_HYBRID_IMAGES** (default: False): Save comparison images when using hybrid mode
 - **SAVE_PADDLE_VISUALISATIONS** (default: False): Save images with PaddleOCR bounding boxes overlaid
 
 ### When to use different OCR models
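The re-extraction loop that these parameters drive can be sketched briefly. This is a minimal illustration rather than the project's implementation: it assumes PaddleOCR 3.x's `predict` API, and the function and variable names are illustrative.

import numpy as np
import pytesseract
from PIL import Image

HYBRID_OCR_CONFIDENCE_THRESHOLD = 65  # as documented above
HYBRID_OCR_PADDING = 1                # pixels of padding around each word box

def hybrid_pass(image: Image.Image, paddle_ocr) -> dict:
    """First pass with Tesseract; re-extract low-confidence words with PaddleOCR."""
    data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    for i, word in enumerate(data["text"]):
        conf = int(float(data["conf"][i]))
        if not word.strip() or conf >= HYBRID_OCR_CONFIDENCE_THRESHOLD:
            continue
        # Crop the padded word box, clamped to the image bounds
        left = max(0, data["left"][i] - HYBRID_OCR_PADDING)
        top = max(0, data["top"][i] - HYBRID_OCR_PADDING)
        right = min(image.width, data["left"][i] + data["width"][i] + HYBRID_OCR_PADDING)
        bottom = min(image.height, data["top"][i] + data["height"][i] + HYBRID_OCR_PADDING)
        crop = np.array(image.crop((left, top, right, bottom)))
        # Ask PaddleOCR for a second opinion on just this word
        results = paddle_ocr.predict(crop)
        if results and results[0]["rec_texts"]:
            data["text"][i] = " ".join(results[0]["rec_texts"])
            data["conf"][i] = int(100 * min(results[0]["rec_scores"]))
    return data

The word's original bounding box is kept either way; only the text and confidence are replaced, which is what keeps the result usable for redaction.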
app.py CHANGED

@@ -1242,16 +1242,7 @@ with blocks:
     label=f"Change default redaction settings.{default_text}{textract_text}{comprehend_text}{open_tab_text}".strip(),
     open=EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT,
 ):
-    text_extract_method_radio.render()
-
-    if SHOW_AWS_TEXT_EXTRACTION_OPTIONS:
-        with gr.Accordion(
-            "Enable AWS Textract signature detection (default is off)",
-            open=False,
-        ):
-            handwrite_signature_checkbox.render()
-    else:
-        handwrite_signature_checkbox.render()
+    text_extract_method_radio.render()
 
     if SHOW_LOCAL_OCR_MODEL_OPTIONS:
         with gr.Accordion(

@@ -1259,7 +1250,7 @@ with blocks:
             open=EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT,
         ):
             local_ocr_method_radio = gr.Radio(
-                label="""Choose local OCR model. "tesseract" is the default and will work for most documents. "paddle" is accurate for whole-line text extraction, but word-level extraction is not natively supported, and so word bounding boxes will be inaccurate. "hybrid" is a combination of the two - the first pass through the redactions will be done with Tesseract, and then a second pass will be done with the chosen hybrid model (default PaddleOCR) on words with low confidence.""",
+                label="""Choose local OCR model. "tesseract" is the default and will work for most documents. "paddle" is accurate for whole-line text extraction, but word-level extraction is not natively supported, and so word bounding boxes will be inaccurate. "hybrid-paddle" runs a first pass with Tesseract, then a second pass with PaddleOCR on words with low confidence. "hybrid-vlm" runs a first pass with Tesseract, then a second pass with the chosen vision model (default Dots.OCR) on words with low confidence.""",
                 value=CHOSEN_LOCAL_OCR_MODEL,
                 choices=LOCAL_OCR_MODEL_OPTIONS,
                 interactive=True,

@@ -1274,6 +1265,15 @@ with blocks:
         visible=False,
     )
 
+    if SHOW_AWS_TEXT_EXTRACTION_OPTIONS:
+        with gr.Accordion(
+            "Enable AWS Textract signature detection (default is off)",
+            open=False,
+        ):
+            handwrite_signature_checkbox.render()
+    else:
+        handwrite_signature_checkbox.render()
+
     with gr.Row(equal_height=True):
         pii_identification_method_drop.render()

@@ -1378,16 +1378,20 @@ with blocks:
     with gr.Row(equal_height=False):
         with gr.Column(scale=2):
             textract_job_detail_df = gr.Dataframe(
-                …
+                pd.DataFrame(
+                    columns=[
+                        "job_id",
+                        "file_name",
+                        "job_type",
+                        "signature_extraction",
+                        "job_date_time",
+                    ]
+                ),
+                label="Previous job details",
+                visible=True,
+                type="pandas",
+                wrap=True,
+            )
         with gr.Column(scale=1):
             job_id_textbox = gr.Textbox(
                 label="Job ID to check status",
cli_redact.py CHANGED

@@ -399,7 +399,7 @@ python cli_redact.py --task textract --textract_action list
 )
 pdf_group.add_argument(
     "--chosen_local_ocr_model",
-    choices=["tesseract", "hybrid", "paddle"],
+    choices=["tesseract", "hybrid-paddle", "paddle"],
     default=CHOSEN_LOCAL_OCR_MODEL,
     help="Local OCR model to use.",
 )
requirements.txt CHANGED

@@ -24,13 +24,21 @@ python-dotenv==1.0.1
 awslambdaric==3.1.1
 python-docx==1.2.0
 defusedxml==0.7.1
-# Optional: uncomment the below to install PaddleOCR if you want to use hybrid text extraction (tesseract plus paddleocr)
-# paddlepaddle==3.2.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
-# paddleocr==3.3.0
-
 # Test dependencies
 pytest>=7.0.0
 pytest-cov>=4.0.0
+spaces==0.42.1
+# Optional: uncomment the below to install PaddleOCR if you want to use hybrid text extraction (tesseract plus paddleocr)
+# paddlepaddle==3.2.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
+# paddleocr==3.3.0
+# For running VLMs
+--extra-index-url https://download.pytorch.org/whl/cu126
+torch==2.6.0
+torchvision
+transformers==4.57.1
+accelerate==1.11.0
+bitsandbytes==0.48.1
+flash-attn==2.8.3 # Only compatible with Linux systems
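A quick, illustrative way to confirm the optional VLM stack above installed correctly into the active environment:

import torch
import transformers
import accelerate
import bitsandbytes

print("torch", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("transformers", transformers.__version__)
print("accelerate", accelerate.__version__)
print("bitsandbytes", bitsandbytes.__version__)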
requirements_lightweight.txt ADDED

@@ -0,0 +1,38 @@
+pdfminer.six==20250506
+pdf2image==1.17.0
+pymupdf==1.26.4
+opencv-python==4.12.0.88
+presidio_analyzer==2.2.360
+presidio_anonymizer==2.2.360
+presidio-image-redactor==0.0.57
+pikepdf==9.11.0
+pandas==2.3.3
+scikit-learn==1.7.2
+spacy==3.8.7
+en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
+gradio==5.49.1
+polars==1.33.1
+boto3==1.40.57
+pyarrow==21.0.0
+openpyxl==3.1.5
+Faker==37.8.0
+python-levenshtein==0.27.1
+spaczz==0.6.1
+https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
+rapidfuzz==3.14.1
+python-dotenv==1.0.1
+awslambdaric==3.1.1
+python-docx==1.2.0
+defusedxml==0.7.1
+# Test dependencies
+pytest>=7.0.0
+pytest-cov>=4.0.0
+spaces==0.42.1
+# Optional: uncomment the below to install PaddleOCR if you want to use hybrid text extraction (tesseract plus paddleocr)
+# paddlepaddle==3.2.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
+# paddleocr==3.3.0
src/app_settings.qmd CHANGED

@@ -300,7 +300,7 @@ Configurations related to text extraction, PII detection, and the redaction process.
 ### Local OCR (Tesseract & PaddleOCR)
 
 * **`CHOSEN_LOCAL_OCR_MODEL`**
-  * **Description:** Choose the engine for local OCR: `"tesseract"`, `"paddle"`, or `"hybrid"`.
+  * **Description:** Choose the engine for local OCR: `"tesseract"`, `"paddle"`, or `"hybrid-paddle"`.
   * **Default Value:** `"tesseract"`
 
 * **`SHOW_LOCAL_OCR_MODEL_OPTIONS`**

@@ -308,11 +308,11 @@ Configurations related to text extraction, PII detection, and the redaction process.
   * **Default Value:** `"False"`
 
 * **`HYBRID_OCR_CONFIDENCE_THRESHOLD`**
-  * **Description:** In "hybrid" mode, this is the Tesseract confidence score below which PaddleOCR will be used for re-extraction.
+  * **Description:** In "hybrid-paddle" mode, this is the Tesseract confidence score below which PaddleOCR will be used for re-extraction.
   * **Default Value:** `65`
 
 * **`HYBRID_OCR_PADDING`**
-  * **Description:** In "hybrid" mode, padding added to the word's bounding box before re-extraction.
+  * **Description:** In "hybrid-paddle" mode, padding added to the word's bounding box before re-extraction.
   * **Default Value:** `1`
 
 * **`PADDLE_USE_TEXTLINE_ORIENTATION`**

@@ -323,8 +323,8 @@ Configurations related to text extraction, PII detection, and the redaction process.
   * **Description:** Controls the expansion ratio of the detected text region in PaddleOCR.
   * **Default Value:** `1.2`
 
-* **`…`**
-  * **Description:** Saves comparison images when using "hybrid" OCR mode.
+* **`SAVE_EXAMPLE_HYBRID_IMAGES`**
+  * **Description:** Saves comparison images when using "hybrid-paddle" OCR mode.
   * **Default Value:** `"False"`
 
 * **`SAVE_PADDLE_VISUALISATIONS`**
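Since these settings are plain environment variables, they can be set before the app starts. An illustrative snippet (the variable names come from the settings above; values are read once at startup by tools/config.py, so they must be set before it is imported):

import os

os.environ["CHOSEN_LOCAL_OCR_MODEL"] = "hybrid-paddle"
os.environ["HYBRID_OCR_CONFIDENCE_THRESHOLD"] = "60"  # re-extract below 60 instead of the default 65
os.environ["HYBRID_OCR_PADDING"] = "2"
os.environ["SAVE_EXAMPLE_HYBRID_IMAGES"] = "True"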
src/user_guide.qmd CHANGED

@@ -721,7 +721,7 @@ The hybrid OCR mode uses several configurable parameters:
 
 - **HYBRID_OCR_CONFIDENCE_THRESHOLD** (default: 65): Tesseract confidence score below which PaddleOCR will be used for re-extraction
 - **HYBRID_OCR_PADDING** (default: 1): Padding added to word bounding boxes before re-extraction
-- **…
+- **SAVE_EXAMPLE_HYBRID_IMAGES** (default: False): Save comparison images when using hybrid mode
 - **SAVE_PADDLE_VISUALISATIONS** (default: False): Save images with PaddleOCR bounding boxes overlaid
 
 ### When to use different OCR models
tools/config.py CHANGED

@@ -437,10 +437,54 @@ DEFAULT_TABULAR_ANONYMISATION_STRATEGY = get_or_create_env_var(
     "DEFAULT_TABULAR_ANONYMISATION_STRATEGY", "redact completely"
 )
 
+###
+# LOCAL OCR MODEL OPTIONS
+###
+
+### VLM OPTIONS
+
+SHOW_VLM_MODEL_OPTIONS = convert_string_to_boolean(
+    get_or_create_env_var("SHOW_VLM_MODEL_OPTIONS", "False")
+)  # Whether to show the VLM model options in the UI
+
+SELECTED_MODEL = get_or_create_env_var(
+    "SELECTED_MODEL", "Dots.OCR"
+)  # Selected vision model. Choose from: "olmOCR-2-7B-1025", "Nanonets-OCR2-3B", "Chandra-OCR", "Dots.OCR"
+
+if SHOW_VLM_MODEL_OPTIONS:
+    VLM_MODEL_OPTIONS = [
+        SELECTED_MODEL,
+    ]
+
+MAX_SPACES_GPU_RUN_TIME = int(
+    get_or_create_env_var("MAX_SPACES_GPU_RUN_TIME", "60")
+)  # Maximum number of seconds to run the GPU on Spaces
+
+MAX_NEW_TOKENS = int(
+    get_or_create_env_var("MAX_NEW_TOKENS", "30")
+)  # Maximum number of tokens to generate
+
+DEFAULT_MAX_NEW_TOKENS = int(
+    get_or_create_env_var("DEFAULT_MAX_NEW_TOKENS", "30")
+)  # Default maximum number of tokens to generate
+
+MAX_INPUT_TOKEN_LENGTH = int(
+    get_or_create_env_var("MAX_INPUT_TOKEN_LENGTH", "4096")
+)  # Maximum number of tokens to input to the VLM
+
+USE_FLASH_ATTENTION = convert_string_to_boolean(
+    get_or_create_env_var("USE_FLASH_ATTENTION", "False")
+)  # Whether to use flash attention for the VLM
+
+OVERWRITE_EXISTING_OCR_RESULTS = convert_string_to_boolean(
+    get_or_create_env_var("OVERWRITE_EXISTING_OCR_RESULTS", "False")
+)  # If True, always create new OCR results instead of loading from existing JSON files
+
 ### Local OCR model - Tesseract vs PaddleOCR
 CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var(
     "CHOSEN_LOCAL_OCR_MODEL", "tesseract"
-)  # Choose between "tesseract", "hybrid", and "paddle". "paddle" is accurate for whole-line text extraction, but word-level extraction is not natively supported, and so word bounding boxes will be inaccurate. "hybrid" runs a first pass with Tesseract, then a second pass with the chosen hybrid model (default PaddleOCR) on words with low confidence.
+)  # Choose between "tesseract", "hybrid-paddle", and "paddle". "paddle" is accurate for whole-line text extraction, but word-level extraction is not natively supported, and so word bounding boxes will be inaccurate. "hybrid-paddle" runs a first pass with Tesseract, then a second pass with the chosen hybrid model (default PaddleOCR) on words with low confidence.
 
 SHOW_LOCAL_OCR_MODEL_OPTIONS = convert_string_to_boolean(
     get_or_create_env_var("SHOW_LOCAL_OCR_MODEL_OPTIONS", "False")

@@ -448,12 +492,19 @@ SHOW_LOCAL_OCR_MODEL_OPTIONS = convert_string_to_boolean(
 if SHOW_LOCAL_OCR_MODEL_OPTIONS:
     LOCAL_OCR_MODEL_OPTIONS = [
         "tesseract",
-        "hybrid",
+        "hybrid-paddle",
         "paddle",
     ]
 else:
     LOCAL_OCR_MODEL_OPTIONS = ["tesseract"]
 
+vlm_options = ["hybrid-vlm"]
+if SHOW_VLM_MODEL_OPTIONS:
+    LOCAL_OCR_MODEL_OPTIONS.extend(vlm_options)
+
+MODEL_CACHE_PATH = get_or_create_env_var("MODEL_CACHE_PATH", "./model_cache")
+
 HYBRID_OCR_CONFIDENCE_THRESHOLD = int(
     get_or_create_env_var("HYBRID_OCR_CONFIDENCE_THRESHOLD", "65")
 )  # The Tesseract confidence threshold under which the text will be passed to PaddleOCR for re-extraction using the hybrid OCR method.

@@ -461,6 +512,14 @@ HYBRID_OCR_PADDING = int(
     get_or_create_env_var("HYBRID_OCR_PADDING", "1")
 )  # The padding to add to the text when passing it to PaddleOCR for re-extraction using the hybrid OCR method.
 
+TESSERACT_SEGMENTATION_LEVEL = get_or_create_env_var(
+    "TESSERACT_SEGMENTATION_LEVEL", "word"
+)  # Tesseract segmentation level: "word" (PSM 11) or "line" (PSM 6)
+
+CONVERT_LINE_TO_WORD_LEVEL = convert_string_to_boolean(
+    get_or_create_env_var("CONVERT_LINE_TO_WORD_LEVEL", "False")
+)  # Whether to convert line-level OCR results to word-level for better precision
+
 PADDLE_USE_TEXTLINE_ORIENTATION = convert_string_to_boolean(
     get_or_create_env_var("PADDLE_USE_TEXTLINE_ORIENTATION", "False")
 )

@@ -469,14 +528,22 @@ PADDLE_DET_DB_UNCLIP_RATIO = float(
     get_or_create_env_var("PADDLE_DET_DB_UNCLIP_RATIO", "1.2")
 )
 
-…
-    get_or_create_env_var("…
+SAVE_EXAMPLE_HYBRID_IMAGES = convert_string_to_boolean(
+    get_or_create_env_var("SAVE_EXAMPLE_HYBRID_IMAGES", "False")
 )  # Whether to save example images of Tesseract vs PaddleOCR re-extraction in hybrid OCR mode.
 
 SAVE_PADDLE_VISUALISATIONS = convert_string_to_boolean(
     get_or_create_env_var("SAVE_PADDLE_VISUALISATIONS", "False")
 )  # Whether to save visualisations of PaddleOCR bounding boxes.
 
+SAVE_TESSERACT_VISUALISATIONS = convert_string_to_boolean(
+    get_or_create_env_var("SAVE_TESSERACT_VISUALISATIONS", "False")
+)  # Whether to save visualisations of Tesseract bounding boxes.
+
+SAVE_TEXTRACT_VISUALISATIONS = convert_string_to_boolean(
+    get_or_create_env_var("SAVE_TEXTRACT_VISUALISATIONS", "False")
+)  # Whether to save visualisations of AWS Textract bounding boxes.
+
 # Model storage paths for Lambda compatibility
 PADDLE_MODEL_PATH = get_or_create_env_var(
     "PADDLE_MODEL_PATH", ""

@@ -487,7 +554,7 @@ SPACY_MODEL_PATH = get_or_create_env_var(
 )  # Directory for spaCy model storage. Uses default location if not set.
 
 PREPROCESS_LOCAL_OCR_IMAGES = get_or_create_env_var(
-    "PREPROCESS_LOCAL_OCR_IMAGES", "…"
+    "PREPROCESS_LOCAL_OCR_IMAGES", "True"
 )  # Whether to try and preprocess images before extracting text. NOTE: I have found in testing that this doesn't necessarily improve results, and greatly slows down extraction.
 
 # Entities for redaction

@@ -1012,6 +1079,7 @@ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = int(
 )  # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
 
 
+
 ###
 # Config vars output format
 ###
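The block above leans on two helpers whose definitions are not part of this diff. The sketch below shows plausible behaviour only, inferred from how they are called; the real implementations live elsewhere in tools/config.py.

import os

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Return the environment variable if set, otherwise fall back to (and record) the default
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value

def convert_string_to_boolean(value: str) -> bool:
    # Interpret "True"-style strings as booleans
    return value.strip().lower() in ("true", "1", "yes")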
tools/custom_image_analyser_engine.py CHANGED

@@ -17,22 +17,28 @@ from presidio_analyzer import AnalyzerEngine, RecognizerResult
 
 from tools.config import (
     AWS_PII_OPTION,
+    CONVERT_LINE_TO_WORD_LEVEL,
     DEFAULT_LANGUAGE,
     HYBRID_OCR_CONFIDENCE_THRESHOLD,
     HYBRID_OCR_PADDING,
     LOCAL_OCR_MODEL_OPTIONS,
     LOCAL_PII_OPTION,
+    MAX_NEW_TOKENS,
     OUTPUT_FOLDER,
     PADDLE_DET_DB_UNCLIP_RATIO,
     PADDLE_MODEL_PATH,
     PADDLE_USE_TEXTLINE_ORIENTATION,
     PREPROCESS_LOCAL_OCR_IMAGES,
-    …
+    SAVE_EXAMPLE_HYBRID_IMAGES,
     SAVE_PADDLE_VISUALISATIONS,
+    SAVE_TESSERACT_VISUALISATIONS,
+    SELECTED_MODEL,
+    TESSERACT_SEGMENTATION_LEVEL,
 )
 from tools.helper_functions import clean_unicode_text
 from tools.load_spacy_model_custom_recognisers import custom_entities
 from tools.presidio_analyzer_custom import recognizer_result_from_dict
+from tools.run_vlm import generate_image as vlm_generate_image
 from tools.secure_path_utils import validate_folder_containment
 from tools.secure_regex_utils import safe_sanitize_text

@@ -177,6 +183,7 @@ class OCRResult:
     height: int
     conf: float = None
     line: int = None
+    model: str = None  # Track which OCR model was used (e.g., "Tesseract", "Paddle", "VLM")
 
 
 @dataclass

@@ -368,30 +375,88 @@ class ContrastSegmentedImageEnhancer(ImagePreprocessor):
         adjusted_contrast = contrast
         return adjusted_image, contrast, adjusted_contrast
 
+    def _deskew(self, image_np: np.ndarray) -> np.ndarray:
+        """
+        Corrects the skew of an image.
+        This method works best on a grayscaled image.
+        """
+        # Work on a copy for angle detection
+        gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY) if len(image_np.shape) == 3 else image_np.copy()
+
+        # Invert the image for contour finding
+        thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
+
+        coords = np.column_stack(np.where(thresh > 0))
+        angle = cv2.minAreaRect(coords)[-1]
+
+        # Adjust the angle for rotation
+        if angle < -45:
+            angle = -(90 + angle)
+        else:
+            angle = -angle
+
+        # Don't rotate if the angle is negligible
+        if abs(angle) < 0.1:
+            return image_np
+
+        (h, w) = image_np.shape[:2]
+        center = (w // 2, h // 2)
+        M = cv2.getRotationMatrix2D(center, angle, 1.0)
+
+        # Use the original numpy image for the rotation to preserve quality
+        rotated = cv2.warpAffine(
+            image_np, M, (w, h),
+            flags=cv2.INTER_CUBIC,
+            borderMode=cv2.BORDER_REPLICATE
+        )
+
+        return rotated
+
     def preprocess_image(
-        self,
-        …
+        self,
+        image: Image.Image,
+        perform_deskew: bool = False,
+        perform_binarization: bool = False,
     ) -> Tuple[Image.Image, dict]:
         """
-        A …
-        Order: Greyscale -> Rescale -> Denoise -> Enhance Contrast -> Binarize
-
-        I have found that binarization is not always helpful with Tesseract, and can sometimes degrade results. So it is off by default.
+        A pipeline for OCR preprocessing.
+        Order: Deskew -> Greyscale -> Rescale -> Denoise -> Enhance Contrast -> Binarize
         """
-        # 1. Convert to …
-        …
+        # 1. Convert PIL image to NumPy array for OpenCV processing (RGB -> BGR)
+        image_np = np.array(image.convert("RGB"))
+        image_np_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
+
+        # 2. Deskew the image, best done early on the full-quality image
+        if perform_deskew:
+            deskewed_image_np = self._deskew(image_np_bgr)
+        else:
+            deskewed_image_np = image_np_bgr
+
+        # 3. Convert to greyscale
+        gray_image_np = cv2.cvtColor(deskewed_image_np, cv2.COLOR_BGR2GRAY)
+
+        # 4. Rescale image to optimal DPI
         rescaled_image_np, scale_metadata = self.image_rescaling.preprocess_image(
-            …
+            gray_image_np
         )
 
-        # …
+        # 5. Apply filtering for noise reduction
         filtered_image_np, _ = self.bilateral_filter.preprocess_image(rescaled_image_np)
 
-        # …
+        # 6. Improve contrast
         adjusted_image_np, _, _ = self._improve_contrast(filtered_image_np)
 
-        # …
+        # 7. Adaptive thresholding (binarization) - final optional step
         if perform_binarization:
             final_image_np, threshold_metadata = (
                 self.adaptive_threshold.preprocess_image(adjusted_image_np)

@@ -404,7 +469,8 @@ class ContrastSegmentedImageEnhancer(ImagePreprocessor):
         final_metadata = {**scale_metadata, **threshold_metadata}
 
         # Convert final numpy array back to PIL Image for return
-        …
+        # The final image is greyscale, so it's safe to use 'L' mode
+        return Image.fromarray(final_image_np).convert('L'), final_metadata
 
 
 def rescale_ocr_data(ocr_data, scale_factor: float):

@@ -447,10 +513,6 @@ def filter_entities_for_language(
     print(f"No entities provided for language: {language}")
     # raise Warning(f"No entities provided for language: {language}")
 
-    # print("entities:", entities)
-    # print("valid_language_entities:", valid_language_entities)
-    # print("language:", language)
-
     filtered_entities = [
         entity for entity in entities if entity in valid_language_entities
     ]

@@ -467,6 +529,75 @@ def filter_entities_for_language(
     return filtered_entities
 
 
+def _get_tesseract_psm(segmentation_level: str) -> int:
+    """
+    Get the appropriate Tesseract PSM (Page Segmentation Mode) value based on segmentation level.
+
+    Args:
+        segmentation_level: "word" or "line"
+
+    Returns:
+        PSM value for Tesseract configuration
+    """
+    if segmentation_level.lower() == "line":
+        return 6  # Uniform block of text
+    elif segmentation_level.lower() == "word":
+        return 11  # Sparse text (word-level)
+    else:
+        print(
+            f"Warning: Unknown segmentation level '{segmentation_level}', defaulting to word-level (PSM 11)"
+        )
+        return 11
+
+
+def _vlm_ocr_predict(
+    image: Image.Image,
+    prompt: str = "Extract all text from this image. Return only the text, no other information.",
+) -> Dict[str, Any]:
+    """
+    VLM OCR prediction function that mimics PaddleOCR's interface.
+
+    Args:
+        image: PIL Image to process
+        prompt: Text prompt for the VLM
+
+    Returns:
+        Dictionary in PaddleOCR format with 'rec_texts' and 'rec_scores'
+    """
+    try:
+        # Use the VLM to extract text
+        extracted_text = vlm_generate_image(
+            text=prompt,
+            image=image,
+            max_new_tokens=MAX_NEW_TOKENS,
+            temperature=0.7,
+            top_p=0.9,
+            top_k=50,
+            repetition_penalty=1.3,
+        )
+
+        if extracted_text and extracted_text.strip():
+            # Clean the text
+            cleaned_text = extracted_text.strip()
+
+            # Split into words for compatibility with PaddleOCR format
+            words = cleaned_text.split()
+
+            # Create a PaddleOCR-compatible result
+            result = {
+                "rec_texts": words,
+                "rec_scores": [0.95] * len(words),  # High confidence for VLM results
+            }
+
+            return result
+        else:
+            return {"rec_texts": [], "rec_scores": []}
+
+    except Exception as e:
+        print(f"VLM OCR error: {e}")
+        return {"rec_texts": [], "rec_scores": []}
+
+
 class CustomImageAnalyzerEngine:
     def __init__(
         self,

@@ -481,9 +612,9 @@ class CustomImageAnalyzerEngine:
         """
         Initializes the CustomImageAnalyzerEngine.
 
-        :param ocr_engine: The OCR engine to use ("tesseract", "hybrid", or "paddle").
+        :param ocr_engine: The OCR engine to use ("tesseract", "hybrid-paddle", "hybrid-vlm", or "paddle").
         :param analyzer_engine: The Presidio AnalyzerEngine instance.
-        :param tesseract_config: Configuration string for Tesseract.
+        :param tesseract_config: Configuration string for Tesseract. If None, uses TESSERACT_SEGMENTATION_LEVEL config.
         :param paddle_kwargs: Dictionary of keyword arguments for PaddleOCR constructor.
         :param image_preprocessor: Optional image preprocessor.
         :param language: Preferred OCR language (e.g., "en", "fr", "de"). Defaults to DEFAULT_LANGUAGE.

@@ -511,7 +642,7 @@ class CustomImageAnalyzerEngine:
         )
         self.output_folder = normalized_output_folder
 
-        if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid":
+        if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid-paddle":
             if PaddleOCR is None:
                 raise ImportError(
                     "paddleocr is not installed. Please run 'pip install paddleocr paddlepaddle' in your python environment and retry."

@@ -538,22 +669,39 @@ class CustomImageAnalyzerEngine:
             paddle_kwargs.setdefault("lang", self.paddle_lang)
             self.paddle_ocr = PaddleOCR(**paddle_kwargs)
 
+        elif self.ocr_engine == "hybrid-vlm":
+            # VLM-based hybrid OCR - no additional initialization needed
+            # The VLM model is loaded when run_vlm.py is imported
+            print(f"Initializing hybrid VLM OCR with model: {SELECTED_MODEL}")
+            self.paddle_ocr = None  # Not using PaddleOCR
+
         if not analyzer_engine:
             analyzer_engine = AnalyzerEngine()
         self.analyzer_engine = analyzer_engine
 
-        …
+        # Set Tesseract configuration based on segmentation level
+        if tesseract_config:
+            self.tesseract_config = tesseract_config
+        else:
+            psm_value = _get_tesseract_psm(TESSERACT_SEGMENTATION_LEVEL)
+            self.tesseract_config = f"--oem 3 --psm {psm_value}"
 
         if not image_preprocessor:
             image_preprocessor = ContrastSegmentedImageEnhancer()
         self.image_preprocessor = image_preprocessor
 
-    def _sanitize_filename(…
+    def _sanitize_filename(
+        self, text: str, max_length: int = 20, fallback_prefix: str = "unknown_text"
+    ) -> str:
         """
         Sanitizes text for use in filenames by removing invalid characters and limiting length.
 
         :param text: The text to sanitize
         :param max_length: Maximum length of the sanitized text
+        :param fallback_prefix: Prefix to use if sanitization fails
         :return: Sanitized text safe for filenames
         """
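As an aside on the configuration set above: the segmentation-level mapping feeds straight into pytesseract's config string. A standalone illustration (the input file name is hypothetical):

import pytesseract
from PIL import Image

image = Image.open("page.png")  # hypothetical input
config = "--oem 3 --psm 11"  # what _get_tesseract_psm("word") produces; "line" gives PSM 6
data = pytesseract.image_to_data(image, config=config, output_type=pytesseract.Output.DICT)
print(len(data["text"]), "tokens extracted")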
| 17 |
|
| 18 |
from tools.config import (
|
| 19 |
AWS_PII_OPTION,
|
| 20 |
+
CONVERT_LINE_TO_WORD_LEVEL,
|
| 21 |
DEFAULT_LANGUAGE,
|
| 22 |
HYBRID_OCR_CONFIDENCE_THRESHOLD,
|
| 23 |
HYBRID_OCR_PADDING,
|
| 24 |
LOCAL_OCR_MODEL_OPTIONS,
|
| 25 |
LOCAL_PII_OPTION,
|
| 26 |
+
MAX_NEW_TOKENS,
|
| 27 |
OUTPUT_FOLDER,
|
| 28 |
PADDLE_DET_DB_UNCLIP_RATIO,
|
| 29 |
PADDLE_MODEL_PATH,
|
| 30 |
PADDLE_USE_TEXTLINE_ORIENTATION,
|
| 31 |
PREPROCESS_LOCAL_OCR_IMAGES,
|
| 32 |
+
SAVE_EXAMPLE_HYBRID_IMAGES,
|
| 33 |
SAVE_PADDLE_VISUALISATIONS,
|
| 34 |
+
SAVE_TESSERACT_VISUALISATIONS,
|
| 35 |
+
SELECTED_MODEL,
|
| 36 |
+
TESSERACT_SEGMENTATION_LEVEL,
|
| 37 |
)
|
| 38 |
from tools.helper_functions import clean_unicode_text
|
| 39 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
| 40 |
from tools.presidio_analyzer_custom import recognizer_result_from_dict
|
| 41 |
+
from tools.run_vlm import generate_image as vlm_generate_image
|
| 42 |
from tools.secure_path_utils import validate_folder_containment
|
| 43 |
from tools.secure_regex_utils import safe_sanitize_text
|
| 44 |
|
|
|
|
| 183 |
height: int
|
| 184 |
conf: float = None
|
| 185 |
line: int = None
|
| 186 |
+
model: str = None # Track which OCR model was used (e.g., "Tesseract", "Paddle", "VLM")
|
| 187 |
|
| 188 |
|
| 189 |
@dataclass
|
|
|
|
| 375 |
adjusted_contrast = contrast
|
| 376 |
return adjusted_image, contrast, adjusted_contrast
|
| 377 |
|
| 378 |
+
def _deskew(self, image_np: np.ndarray) -> np.ndarray:
|
| 379 |
+
"""
|
| 380 |
+
Corrects the skew of an image.
|
| 381 |
+
This method works best on a grayscaled image.
|
| 382 |
+
"""
|
| 383 |
+
# We'll work with a copy for angle detection
|
| 384 |
+
gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY) if len(image_np.shape) == 3 else image_np.copy()
|
| 385 |
+
|
| 386 |
+
# Invert the image for contour finding
|
| 387 |
+
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
|
| 388 |
+
|
| 389 |
+
coords = np.column_stack(np.where(thresh > 0))
|
| 390 |
+
angle = cv2.minAreaRect(coords)[-1]
|
| 391 |
+
|
| 392 |
+
# Adjust the angle for rotation
|
| 393 |
+
if angle < -45:
|
| 394 |
+
angle = -(90 + angle)
|
| 395 |
+
else:
|
| 396 |
+
angle = -angle
|
| 397 |
+
|
| 398 |
+
# Don't rotate if the angle is negligible
|
| 399 |
+
if abs(angle) < 0.1:
|
| 400 |
+
return image_np
|
| 401 |
+
|
| 402 |
+
(h, w) = image_np.shape[:2]
|
| 403 |
+
center = (w // 2, h // 2)
|
| 404 |
+
M = cv2.getRotationMatrix2D(center, angle, 1.0)
|
| 405 |
+
|
| 406 |
+
# Use the original numpy image for the rotation to preserve quality
|
| 407 |
+
rotated = cv2.warpAffine(
|
| 408 |
+
image_np, M, (w, h),
|
| 409 |
+
flags=cv2.INTER_CUBIC,
|
| 410 |
+
borderMode=cv2.BORDER_REPLICATE
|
| 411 |
+
)
|
| 412 |
+
|
| 413 |
+
return rotated
|
| 414 |
+
|
| 415 |
def preprocess_image(
|
| 416 |
+
self,
|
| 417 |
+
image: Image.Image,
|
| 418 |
+
perform_deskew: bool = False,
|
| 419 |
+
perform_binarization: bool = False,
|
| 420 |
) -> Tuple[Image.Image, dict]:
|
| 421 |
"""
|
| 422 |
+
A pipeline for OCR preprocessing.
|
| 423 |
+
Order: Deskew -> Greyscale -> Rescale -> Denoise -> Enhance Contrast -> Binarize
|
|
|
|
|
|
|
| 424 |
"""
|
| 425 |
+
# 1. Convert PIL image to NumPy array for OpenCV processing
|
| 426 |
+
# Assuming the original image is RGB
|
| 427 |
+
image_np = np.array(image.convert("RGB"))
|
| 428 |
+
# OpenCV uses BGR, so we convert RGB to BGR
|
| 429 |
+
image_np_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
|
| 430 |
+
|
| 431 |
+
# --- REVISED PIPELINE ---
|
| 432 |
+
|
| 433 |
+
# 2. Deskew the image (critical new step)
|
| 434 |
+
# This is best done early on the full-quality image.
|
| 435 |
+
if perform_deskew:
|
| 436 |
+
deskewed_image_np = self._deskew(image_np_bgr)
|
| 437 |
+
else:
|
| 438 |
+
deskewed_image_np = image_np_bgr
|
| 439 |
+
|
| 440 |
+
# 3. Convert to greyscale
|
| 441 |
+
# Your convert_image_to_array probably does this, but for clarity:
|
| 442 |
+
gray_image_np = cv2.cvtColor(deskewed_image_np, cv2.COLOR_BGR2GRAY)
|
| 443 |
+
|
| 444 |
+
# 4. Rescale image to optimal DPI
|
| 445 |
+
# Assuming your image_rescaling object can handle a greyscale numpy array
|
| 446 |
rescaled_image_np, scale_metadata = self.image_rescaling.preprocess_image(
|
| 447 |
+
gray_image_np
|
| 448 |
)
|
| 449 |
|
| 450 |
+
# 5. Apply filtering for noise reduction
|
| 451 |
+
# Suggestion: A Median filter is often very effective for scanned docs
|
| 452 |
+
# filtered_image_np = cv2.medianBlur(rescaled_image_np, 3)
|
| 453 |
+
# Or using your existing bilateral filter:
|
| 454 |
filtered_image_np, _ = self.bilateral_filter.preprocess_image(rescaled_image_np)
|
| 455 |
|
| 456 |
+
# 6. Improve contrast
|
| 457 |
adjusted_image_np, _, _ = self._improve_contrast(filtered_image_np)
|
| 458 |
|
| 459 |
+
# 7. Adaptive Thresholding (Binarization) - Final optional step
|
| 460 |
if perform_binarization:
|
| 461 |
final_image_np, threshold_metadata = (
|
| 462 |
self.adaptive_threshold.preprocess_image(adjusted_image_np)
|
|
|
|
| 469 |
final_metadata = {**scale_metadata, **threshold_metadata}
|
| 470 |
|
| 471 |
# Convert final numpy array back to PIL Image for return
|
| 472 |
+
# The final image is greyscale, so it's safe to use 'L' mode
|
| 473 |
+
return Image.fromarray(final_image_np).convert('L'), final_metadata
|
| 474 |
|
| 475 |
|
| 476 |
def rescale_ocr_data(ocr_data, scale_factor: float):
|
|
|
|
| 513 |
print(f"No entities provided for language: {language}")
|
| 514 |
# raise Warning(f"No entities provided for language: {language}")
|
| 515 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 516 |
filtered_entities = [
|
| 517 |
entity for entity in entities if entity in valid_language_entities
|
| 518 |
]
|
|
|
|
| 529 |
return filtered_entities
|
| 530 |
|
| 531 |
|
| 532 |
+
def _get_tesseract_psm(segmentation_level: str) -> int:
|
| 533 |
+
"""
|
| 534 |
+
Get the appropriate Tesseract PSM (Page Segmentation Mode) value based on segmentation level.
|
| 535 |
+
|
| 536 |
+
Args:
|
| 537 |
+
segmentation_level: "word" or "line"
|
| 538 |
+
|
| 539 |
+
Returns:
|
| 540 |
+
PSM value for Tesseract configuration
|
| 541 |
+
"""
|
| 542 |
+
if segmentation_level.lower() == "line":
|
| 543 |
+
return 6 # Uniform block of text
|
| 544 |
+
elif segmentation_level.lower() == "word":
|
| 545 |
+
return 11 # Sparse text (word-level)
|
| 546 |
+
else:
|
| 547 |
+
print(
|
| 548 |
+
f"Warning: Unknown segmentation level '{segmentation_level}', defaulting to word-level (PSM 11)"
|
| 549 |
+
)
|
| 550 |
+
return 11
|
| 551 |
+
|
| 552 |
+
|
| 553 |
+
def _vlm_ocr_predict(
|
| 554 |
+
image: Image.Image,
|
| 555 |
+
prompt: str = "Extract all text from this image. Return only the text, no other information.",
|
| 556 |
+
) -> Dict[str, Any]:
|
| 557 |
+
"""
|
| 558 |
+
VLM OCR prediction function that mimics PaddleOCR's interface.
|
| 559 |
+
|
| 560 |
+
Args:
|
| 561 |
+
image: PIL Image to process
|
| 562 |
+
prompt: Text prompt for the VLM
|
| 563 |
+
|
| 564 |
+
Returns:
|
| 565 |
+
Dictionary in PaddleOCR format with 'rec_texts' and 'rec_scores'
|
| 566 |
+
"""
|
| 567 |
+
try:
|
| 568 |
+
# Use the VLM to extract text
|
| 569 |
+
extracted_text = vlm_generate_image(
|
| 570 |
+
text=prompt,
|
| 571 |
+
image=image,
|
| 572 |
+
max_new_tokens=MAX_NEW_TOKENS,
|
| 573 |
+
temperature=0.7,
|
| 574 |
+
top_p=0.9,
|
| 575 |
+
top_k=50,
|
| 576 |
+
repetition_penalty=1.3,
|
| 577 |
+
)
|
| 578 |
+
|
| 579 |
+
if extracted_text and extracted_text.strip():
|
| 580 |
+
# Clean the text
|
| 581 |
+
cleaned_text = extracted_text.strip()
|
| 582 |
+
|
| 583 |
+
# Split into words for compatibility with PaddleOCR format
|
| 584 |
+
words = cleaned_text.split()
|
| 585 |
+
|
| 586 |
+
# Create PaddleOCR-compatible result
|
| 587 |
+
result = {
|
| 588 |
+
"rec_texts": words,
|
| 589 |
+
"rec_scores": [0.95] * len(words), # High confidence for VLM results
|
| 590 |
+
}
|
| 591 |
+
|
| 592 |
+
return result
|
| 593 |
+
else:
|
| 594 |
+
return {"rec_texts": [], "rec_scores": []}
|
| 595 |
+
|
| 596 |
+
except Exception as e:
|
| 597 |
+
print(f"VLM OCR error: {e}")
|
| 598 |
+
return {"rec_texts": [], "rec_scores": []}
|
| 599 |
+
|
| 600 |
+
|
| 601 |
class CustomImageAnalyzerEngine:
|
| 602 |
def __init__(
|
| 603 |
self,
|
|
|
|
| 612 |
"""
|
| 613 |
Initializes the CustomImageAnalyzerEngine.
|
| 614 |
|
| 615 |
+
:param ocr_engine: The OCR engine to use ("tesseract", "hybrid-paddle", "hybrid-vlm", or "paddle").
|
| 616 |
:param analyzer_engine: The Presidio AnalyzerEngine instance.
|
| 617 |
+
:param tesseract_config: Configuration string for Tesseract. If None, uses TESSERACT_SEGMENTATION_LEVEL config.
|
| 618 |
:param paddle_kwargs: Dictionary of keyword arguments for PaddleOCR constructor.
|
| 619 |
:param image_preprocessor: Optional image preprocessor.
|
| 620 |
:param language: Preferred OCR language (e.g., "en", "fr", "de"). Defaults to DEFAULT_LANGUAGE.
|
|
|
|
| 642 |
)
|
| 643 |
self.output_folder = normalized_output_folder
|
| 644 |
|
| 645 |
+
if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid-paddle":
|
| 646 |
if PaddleOCR is None:
|
| 647 |
raise ImportError(
|
| 648 |
"paddleocr is not installed. Please run 'pip install paddleocr paddlepaddle' in your python environment and retry."
|
|
|
|
| 669 |
paddle_kwargs.setdefault("lang", self.paddle_lang)
|
| 670 |
self.paddle_ocr = PaddleOCR(**paddle_kwargs)
|
| 671 |
|
| 672 |
+
elif self.ocr_engine == "hybrid-vlm":
|
| 673 |
+
# VLM-based hybrid OCR - no additional initialization needed
|
| 674 |
+
# The VLM model is loaded when run_vlm.py is imported
|
| 675 |
+
print(f"Initializing hybrid VLM OCR with model: {SELECTED_MODEL}")
|
| 676 |
+
self.paddle_ocr = None # Not using PaddleOCR
|
| 677 |
+
|
| 678 |
if not analyzer_engine:
|
| 679 |
analyzer_engine = AnalyzerEngine()
|
| 680 |
self.analyzer_engine = analyzer_engine
|
| 681 |
|
| 682 |
+
# Set Tesseract configuration based on segmentation level
|
| 683 |
+
if tesseract_config:
|
| 684 |
+
self.tesseract_config = tesseract_config
|
| 685 |
+
else:
|
| 686 |
+
psm_value = _get_tesseract_psm(TESSERACT_SEGMENTATION_LEVEL)
|
| 687 |
+
self.tesseract_config = f"--oem 3 --psm {psm_value}"
|
| 688 |
+
# print(
|
| 689 |
+
# f"Tesseract configured for {TESSERACT_SEGMENTATION_LEVEL}-level segmentation (PSM {psm_value})"
|
| 690 |
+
# )
|
| 691 |
|
| 692 |
if not image_preprocessor:
|
| 693 |
image_preprocessor = ContrastSegmentedImageEnhancer()
|
| 694 |
self.image_preprocessor = image_preprocessor
|
| 695 |
|
| 696 |
+
def _sanitize_filename(
|
| 697 |
+
self, text: str, max_length: int = 20, fallback_prefix: str = "unknown_text"
|
| 698 |
+
) -> str:
|
| 699 |
"""
|
| 700 |
Sanitizes text for use in filenames by removing invalid characters and limiting length.
|
| 701 |
|
| 702 |
:param text: The text to sanitize
|
| 703 |
:param max_length: Maximum length of the sanitized text
|
| 704 |
+
:param fallback_prefix: Prefix to use if sanitization fails
|
| 705 |
:return: Sanitized text safe for filenames
|
| 706 |
"""
|
| 707 |
|
|
|
|
| 716 |
|
| 717 |
# If empty after sanitization, use a default value
|
| 718 |
if not sanitized:
|
| 719 |
+
sanitized = fallback_prefix
|
| 720 |
|
| 721 |
# Limit to max_length characters
|
| 722 |
if len(sanitized) > max_length:
|
|
|
|
| 724 |
# Ensure we don't end with an underscore if we cut in the middle
|
| 725 |
sanitized = sanitized.rstrip("_")
|
| 726 |
|
| 727 |
+
# Final check: if still empty or too short, use fallback
|
| 728 |
+
if not sanitized or len(sanitized) < 3:
|
| 729 |
+
sanitized = fallback_prefix
|
| 730 |
+
|
| 731 |
return sanitized
|
| 732 |
|
| 733 |
+
def _create_safe_filename_with_confidence(
|
| 734 |
+
self,
|
| 735 |
+
original_text: str,
|
| 736 |
+
new_text: str,
|
| 737 |
+
conf: int,
|
| 738 |
+
new_conf: int,
|
| 739 |
+
ocr_type: str = "OCR",
|
| 740 |
+
) -> str:
|
| 741 |
+
"""
|
| 742 |
+
Creates a safe filename using confidence values when text sanitization fails.
|
| 743 |
+
|
| 744 |
+
Args:
|
| 745 |
+
original_text: Original text from Tesseract
|
| 746 |
+
new_text: New text from VLM/PaddleOCR
|
| 747 |
+
conf: Original confidence score
|
| 748 |
+
new_conf: New confidence score
|
| 749 |
+
ocr_type: Type of OCR used (VLM, Paddle, etc.)
|
| 750 |
+
|
| 751 |
+
Returns:
|
| 752 |
+
Safe filename string
|
| 753 |
+
"""
|
| 754 |
+
# Try to sanitize both texts
|
| 755 |
+
safe_original = self._sanitize_filename(
|
| 756 |
+
original_text, max_length=15, fallback_prefix=f"orig_conf_{conf}"
|
| 757 |
+
)
|
| 758 |
+
safe_new = self._sanitize_filename(
|
| 759 |
+
new_text, max_length=15, fallback_prefix=f"new_conf_{new_conf}"
|
| 760 |
+
)
|
| 761 |
+
|
| 762 |
+
# If both sanitizations resulted in fallback names, create a confidence-based name
|
| 763 |
+
if safe_original.startswith("orig_conf_") and safe_new.startswith(
|
| 764 |
+
"new_conf_"
|
| 765 |
+
):
|
| 766 |
+
return f"{ocr_type}_conf_{conf}_to_conf_{new_conf}"
|
| 767 |
+
|
| 768 |
+
return f"{safe_original}_conf_{conf}_to_{safe_new}_conf_{new_conf}"
|
| 769 |
+
|
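Illustrative outputs of the naming scheme above (values invented for the example):

# Both texts sanitise cleanly -> both appear alongside their confidences:
#   "Tota1_conf_42_to_Total_conf_91"
# Both texts fall back to their confidence prefixes -> a purely confidence-based name:
#   "Paddle_conf_42_to_conf_91"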
| 770 |
+
def _convert_line_to_word_level(
|
| 771 |
+
self, line_data: Dict[str, List], image_width: int, image_height: int
|
| 772 |
+
) -> Dict[str, List]:
|
| 773 |
+
"""
|
| 774 |
+
Converts line-level OCR results to word-level results by splitting text and estimating word positions.
|
| 775 |
+
|
| 776 |
+
Args:
|
| 777 |
+
line_data: Dictionary with line-level OCR data (text, left, top, width, height, conf)
|
| 778 |
+
image_width: Width of the original image
|
| 779 |
+
image_height: Height of the original image
|
| 780 |
+
|
| 781 |
+
Returns:
|
| 782 |
+
Dictionary with word-level OCR data in Tesseract format
|
| 783 |
+
"""
|
| 784 |
+
output = {
|
| 785 |
+
"text": list(),
|
| 786 |
+
"left": list(),
|
| 787 |
+
"top": list(),
|
| 788 |
+
"width": list(),
|
| 789 |
+
"height": list(),
|
| 790 |
+
"conf": list(),
|
| 791 |
+
}
|
| 792 |
+
|
| 793 |
+
if not line_data or not line_data.get("text"):
|
| 794 |
+
return output
|
| 795 |
+
|
| 796 |
+
for i in range(len(line_data["text"])):
|
| 797 |
+
line_text = line_data["text"][i]
|
| 798 |
+
line_left = line_data["left"][i]
|
| 799 |
+
line_top = line_data["top"][i]
|
| 800 |
+
line_width = line_data["width"][i]
|
| 801 |
+
line_height = line_data["height"][i]
|
| 802 |
+
line_conf = line_data["conf"][i]
|
| 803 |
+
|
| 804 |
+
# Skip empty lines
|
| 805 |
+
if not line_text.strip():
|
| 806 |
+
continue
|
| 807 |
+
|
| 808 |
+
# Split line into words
|
| 809 |
+
words = line_text.split()
|
| 810 |
+
if not words:
|
| 811 |
+
continue
|
| 812 |
+
|
| 813 |
+
# Calculate character width for this line
|
| 814 |
+
total_chars = len(line_text)
|
| 815 |
+
avg_char_width = line_width / total_chars if total_chars > 0 else 0
|
| 816 |
+
|
| 817 |
+
current_char_offset = 0
|
| 818 |
+
|
| 819 |
+
for word in words:
|
| 820 |
+
# Calculate word width based on character count
|
| 821 |
+
word_width = float(len(word) * avg_char_width)
|
| 822 |
+
word_left = line_left + float(current_char_offset * avg_char_width)
|
| 823 |
+
|
| 824 |
+
# Ensure word doesn't exceed image boundaries
|
| 825 |
+
word_left = max(0, min(word_left, image_width - word_width))
|
| 826 |
+
word_width = min(word_width, image_width - word_left)
|
| 827 |
+
|
| 828 |
+
output["text"].append(word)
|
| 829 |
+
output["left"].append(word_left)
|
| 830 |
+
output["top"].append(line_top)
|
| 831 |
+
output["width"].append(word_width)
|
| 832 |
+
output["height"].append(line_height)
|
| 833 |
+
output["conf"].append(line_conf)
|
| 834 |
+
|
| 835 |
+
# Update offset for the next word (add word length + 1 for the space)
|
| 836 |
+
current_char_offset += len(word) + 1
|
| 837 |
+
|
| 838 |
+
return output
|
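The position estimate is purely character-proportional; a worked example with illustrative numbers:

# Line "John Smith" at left=100, width=200 contains 10 characters,
# so avg_char_width = 200 / 10 = 20.
# "John"  -> left = 100 + 0 * 20 = 100, width = 4 * 20 = 80
# "Smith" -> left = 100 + 5 * 20 = 200, width = 5 * 20 = 100
# Both words inherit the line's top, height, and confidence values.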
| 839 |
+
|
| 840 |
+
def _is_line_level_data(self, ocr_data: Dict[str, List]) -> bool:
|
| 841 |
+
"""
|
| 842 |
+
Determines if OCR data contains line-level results (multiple words per bounding box).
|
| 843 |
+
|
| 844 |
+
Args:
|
| 845 |
+
ocr_data: Dictionary with OCR data
|
| 846 |
+
|
| 847 |
+
Returns:
|
| 848 |
+
True if data appears to be line-level, False otherwise
|
| 849 |
+
"""
|
| 850 |
+
if not ocr_data or not ocr_data.get("text"):
|
| 851 |
+
return False
|
| 852 |
+
|
| 853 |
+
# Check if any text entries contain multiple words
|
| 854 |
+
for text in ocr_data["text"]:
|
| 855 |
+
if text.strip() and len(text.split()) > 1:
|
| 856 |
+
return True
|
| 857 |
+
|
| 858 |
+
return False
|
| 859 |
+
|
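In short (illustrative dictionaries):

# Word-level data is left untouched:
#   {"text": ["John", "Smith"], ...}           -> _is_line_level_data(...) is False
# Line-level data triggers the word-level conversion above:
#   {"text": ["John Smith", "123 Road"], ...}  -> _is_line_level_data(...) is True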
| 860 |
def _convert_paddle_to_tesseract_format(
|
| 861 |
self, paddle_results: List[Any]
|
| 862 |
) -> Dict[str, List]:
|
|
|
|
| 900 |
line_width = float(max(x_coords) - line_left)
|
| 901 |
line_height = float(max(y_coords) - line_top)
|
| 902 |
|
| 903 |
+
# Add line-level data
|
| 904 |
+
output["text"].append(line_text)
|
| 905 |
+
output["left"].append(line_left)
|
| 906 |
+
output["top"].append(line_top)
|
| 907 |
+
output["width"].append(line_width)
|
| 908 |
+
output["height"].append(line_height)
|
| 909 |
+
output["conf"].append(int(line_confidence * 100))
|
| 910 |
|
| 911 |
+
return output
|
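A minimal sketch of the polygon-to-box reduction used above, assuming PaddleOCR returns a four-point quad per detected line:

# Assumed PaddleOCR-style quad: [[x, y], ...] clockwise from top-left.
quad = [[102.0, 50.0], [310.0, 52.0], [309.0, 84.0], [101.0, 82.0]]
x_coords = [p[0] for p in quad]
y_coords = [p[1] for p in quad]
line_left, line_top = min(x_coords), min(y_coords)
line_width = max(x_coords) - line_left   # 209.0
line_height = max(y_coords) - line_top   # 34.0
# Paddle scores are 0-1 floats; the engine rescales them to Tesseract's 0-100 ints:
conf = int(0.87 * 100)                   # 87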
| 912 |
|
| 913 |
+
def _visualize_tesseract_bounding_boxes(
|
| 914 |
+
self,
|
| 915 |
+
image: Image.Image,
|
| 916 |
+
ocr_data: Dict[str, List],
|
| 917 |
+
image_name: str = None,
|
| 918 |
+
visualisation_folder: str = "tesseract_visualisations",
|
| 919 |
+
) -> None:
|
| 920 |
+
"""
|
| 921 |
+
Visualizes Tesseract OCR bounding boxes with confidence-based colors and a legend.
|
| 922 |
|
| 923 |
+
Args:
|
| 924 |
+
image: The PIL Image object
|
| 925 |
+
ocr_data: Tesseract OCR data dictionary
|
| 926 |
+
image_name: Optional name for the saved image file
|
| 927 |
+
"""
|
| 928 |
+
if not ocr_data or not ocr_data.get("text"):
|
| 929 |
+
return
|
| 930 |
|
| 931 |
+
# Convert PIL image to OpenCV format
|
| 932 |
+
image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 933 |
|
| 934 |
+
# Get image dimensions
|
| 935 |
+
height, width = image_cv.shape[:2]
|
| 936 |
|
| 937 |
+
# Define confidence ranges and colors
|
| 938 |
+
confidence_ranges = [
|
| 939 |
+
(80, 100, (0, 255, 0), "High (80-100%)"), # Green
|
| 940 |
+
(50, 79, (0, 165, 255), "Medium (50-79%)"), # Orange
|
| 941 |
+
(0, 49, (0, 0, 255), "Low (0-49%)"), # Red
|
| 942 |
+
]
|
| 943 |
+
|
| 944 |
+
# Process each detected text element
|
| 945 |
+
for i in range(len(ocr_data["text"])):
|
| 946 |
+
text = ocr_data["text"][i]
|
| 947 |
+
conf = int(ocr_data["conf"][i])
|
| 948 |
+
|
| 949 |
+
# Skip empty text or invalid confidence
|
| 950 |
+
if not text.strip() or conf == -1:
|
| 951 |
+
continue
|
| 952 |
+
|
| 953 |
+
left = ocr_data["left"][i]
|
| 954 |
+
top = ocr_data["top"][i]
|
| 955 |
+
width_box = ocr_data["width"][i]
|
| 956 |
+
height_box = ocr_data["height"][i]
|
| 957 |
+
|
| 958 |
+
# Calculate bounding box coordinates
|
| 959 |
+
x1 = int(left)
|
| 960 |
+
y1 = int(top)
|
| 961 |
+
x2 = int(left + width_box)
|
| 962 |
+
y2 = int(top + height_box)
|
| 963 |
+
|
| 964 |
+
# Ensure coordinates are within image bounds
|
| 965 |
+
x1 = max(0, min(x1, width))
|
| 966 |
+
y1 = max(0, min(y1, height))
|
| 967 |
+
x2 = max(0, min(x2, width))
|
| 968 |
+
y2 = max(0, min(y2, height))
|
| 969 |
+
|
| 970 |
+
# Skip if bounding box is invalid
|
| 971 |
+
if x2 <= x1 or y2 <= y1:
|
| 972 |
+
continue
|
| 973 |
+
|
| 974 |
+
# Determine color based on confidence score
|
| 975 |
+
color = (0, 0, 255) # Default to red
|
| 976 |
+
for min_conf, max_conf, conf_color, _ in confidence_ranges:
|
| 977 |
+
if min_conf <= conf <= max_conf:
|
| 978 |
+
color = conf_color
|
| 979 |
+
break
|
| 980 |
+
|
| 981 |
+
# Draw bounding box
|
| 982 |
+
cv2.rectangle(image_cv, (x1, y1), (x2, y2), color, 1)
|
| 983 |
+
|
| 984 |
+
# Add legend
|
| 985 |
+
self._add_confidence_legend(image_cv, confidence_ranges)
|
| 986 |
+
|
| 987 |
+
# Save the visualization
|
| 988 |
+
tesseract_viz_folder = os.path.join(self.output_folder, visualisation_folder)
|
| 989 |
+
|
| 990 |
+
# Double-check the constructed path is safe
|
| 991 |
+
if not validate_folder_containment(tesseract_viz_folder, OUTPUT_FOLDER):
|
| 992 |
+
raise ValueError(
|
| 993 |
+
f"Unsafe tesseract visualisations folder path: {tesseract_viz_folder}"
|
| 994 |
+
)
|
| 995 |
+
|
| 996 |
+
os.makedirs(tesseract_viz_folder, exist_ok=True)
|
| 997 |
+
|
| 998 |
+
# Generate filename
|
| 999 |
+
if image_name:
|
| 1000 |
+
# Remove file extension if present
|
| 1001 |
+
base_name = os.path.splitext(image_name)[0]
|
| 1002 |
+
filename = f"{base_name}_{visualisation_folder}.jpg"
|
| 1003 |
+
else:
|
| 1004 |
+
timestamp = int(time.time())
|
| 1005 |
+
filename = f"{visualisation_folder}_{timestamp}.jpg"
|
| 1006 |
+
|
| 1007 |
+
output_path = os.path.join(tesseract_viz_folder, filename)
|
| 1008 |
+
|
| 1009 |
+
# Save the image
|
| 1010 |
+
cv2.imwrite(output_path, image_cv)
|
| 1011 |
+
print(f"Tesseract visualization saved to: {output_path}")
|
| 1012 |
+
|
| 1013 |
+
def _add_confidence_legend(
|
| 1014 |
+
self, image_cv: np.ndarray, confidence_ranges: List[Tuple]
|
| 1015 |
+
) -> None:
|
| 1016 |
+
"""
|
| 1017 |
+
Adds a confidence legend to the visualization image.
|
| 1018 |
+
|
| 1019 |
+
Args:
|
| 1020 |
+
image_cv: OpenCV image array
|
| 1021 |
+
confidence_ranges: List of tuples containing (min_conf, max_conf, color, label)
|
| 1022 |
+
"""
|
| 1023 |
+
height, width = image_cv.shape[:2]
|
| 1024 |
+
|
| 1025 |
+
# Legend parameters
|
| 1026 |
+
legend_width = 200
|
| 1027 |
+
legend_height = 100
|
| 1028 |
+
legend_x = width - legend_width - 20
|
| 1029 |
+
legend_y = 20
|
| 1030 |
+
|
| 1031 |
+
# Draw legend background
|
| 1032 |
+
cv2.rectangle(
|
| 1033 |
+
image_cv,
|
| 1034 |
+
(legend_x, legend_y),
|
| 1035 |
+
(legend_x + legend_width, legend_y + legend_height),
|
| 1036 |
+
(255, 255, 255), # White background
|
| 1037 |
+
-1,
|
| 1038 |
+
)
|
| 1039 |
+
cv2.rectangle(
|
| 1040 |
+
image_cv,
|
| 1041 |
+
(legend_x, legend_y),
|
| 1042 |
+
(legend_x + legend_width, legend_y + legend_height),
|
| 1043 |
+
(0, 0, 0), # Black border
|
| 1044 |
+
2,
|
| 1045 |
+
)
|
| 1046 |
+
|
| 1047 |
+
# Add title
|
| 1048 |
+
title_text = "Confidence Levels"
|
| 1049 |
+
font_scale = 0.6
|
| 1050 |
+
font_thickness = 2
|
| 1051 |
+
(title_width, title_height), _ = cv2.getTextSize(
|
| 1052 |
+
title_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness
|
| 1053 |
+
)
|
| 1054 |
+
title_x = legend_x + (legend_width - title_width) // 2
|
| 1055 |
+
title_y = legend_y + title_height + 10
|
| 1056 |
+
cv2.putText(
|
| 1057 |
+
image_cv,
|
| 1058 |
+
title_text,
|
| 1059 |
+
(title_x, title_y),
|
| 1060 |
+
cv2.FONT_HERSHEY_SIMPLEX,
|
| 1061 |
+
font_scale,
|
| 1062 |
+
(0, 0, 0), # Black text
|
| 1063 |
+
font_thickness,
|
| 1064 |
+
)
|
| 1065 |
+
|
| 1066 |
+
# Add confidence range items
|
| 1067 |
+
item_spacing = 25
|
| 1068 |
+
start_y = title_y + 25
|
| 1069 |
+
|
| 1070 |
+
for i, (min_conf, max_conf, color, label) in enumerate(confidence_ranges):
|
| 1071 |
+
item_y = start_y + i * item_spacing
|
| 1072 |
+
|
| 1073 |
+
# Draw color box
|
| 1074 |
+
box_size = 15
|
| 1075 |
+
box_x = legend_x + 10
|
| 1076 |
+
box_y = item_y - box_size
|
| 1077 |
+
cv2.rectangle(
|
| 1078 |
+
image_cv,
|
| 1079 |
+
(box_x, box_y),
|
| 1080 |
+
(box_x + box_size, box_y + box_size),
|
| 1081 |
+
color,
|
| 1082 |
+
-1,
|
| 1083 |
+
)
|
| 1084 |
+
cv2.rectangle(
|
| 1085 |
+
image_cv,
|
| 1086 |
+
(box_x, box_y),
|
| 1087 |
+
(box_x + box_size, box_y + box_size),
|
| 1088 |
+
(0, 0, 0), # Black border
|
| 1089 |
+
1,
|
| 1090 |
+
)
|
| 1091 |
+
|
| 1092 |
+
# Add label text
|
| 1093 |
+
label_x = box_x + box_size + 10
|
| 1094 |
+
label_y = item_y - 5
|
| 1095 |
+
cv2.putText(
|
| 1096 |
+
image_cv,
|
| 1097 |
+
label,
|
| 1098 |
+
(label_x, label_y),
|
| 1099 |
+
cv2.FONT_HERSHEY_SIMPLEX,
|
| 1100 |
+
0.5,
|
| 1101 |
+
(0, 0, 0), # Black text
|
| 1102 |
+
1,
|
| 1103 |
+
)
|
| 1104 |
|
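The hybrid pass defined below follows a simple per-word rule; a condensed sketch (threshold value illustrative, and the empty-result branch that discards the word is omitted):

def choose(text, conf, new_text, new_conf, ocr_type, threshold=50):
    # ocr_type is "Paddle" or "VLM", matching the per-word model labels.
    if conf >= threshold or new_conf <= conf:
        return text, conf, "Tesseract"
    return new_text, new_conf, ocr_type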
| 1105 |
def _perform_hybrid_ocr(
|
| 1106 |
self,
|
|
|
|
| 1111 |
image_name: str = "unknown_image_name",
|
| 1112 |
) -> Dict[str, list]:
|
| 1113 |
"""
|
| 1114 |
+
Performs OCR using Tesseract for bounding boxes and PaddleOCR/VLM for low-confidence text.
|
| 1115 |
Returns data in the same dictionary format as pytesseract.image_to_data.
|
| 1116 |
"""
|
| 1117 |
+
# Determine if we're using VLM or PaddleOCR
|
| 1118 |
+
use_vlm = self.ocr_engine == "hybrid-vlm"
|
| 1119 |
+
|
| 1120 |
+
if not use_vlm:
|
| 1121 |
+
if ocr is None:
|
| 1122 |
+
if hasattr(self, "paddle_ocr") and self.paddle_ocr is not None:
|
| 1123 |
+
ocr = self.paddle_ocr
|
| 1124 |
+
else:
|
| 1125 |
+
raise ValueError(
|
| 1126 |
+
"No OCR object provided and 'paddle_ocr' is not initialized."
|
| 1127 |
+
)
|
| 1128 |
|
| 1129 |
print("Starting hybrid OCR process...")
|
| 1130 |
|
|
|
|
| 1143 |
"width": list(),
|
| 1144 |
"height": list(),
|
| 1145 |
"conf": list(),
|
| 1146 |
+
"model": list(), # Track which model was used for each word
|
| 1147 |
}
|
| 1148 |
|
| 1149 |
num_words = len(tesseract_data["text"])
|
|
|
|
| 1164 |
height = tesseract_data["height"][i]
|
| 1165 |
# line_number = tesseract_data['abs_line_id'][i]
|
| 1166 |
|
| 1167 |
+
# Initialize model as Tesseract (default)
|
| 1168 |
+
model_used = "Tesseract"
|
| 1169 |
+
|
| 1170 |
# If confidence is low, use PaddleOCR or the VLM for a second opinion
|
| 1171 |
if conf < confidence_threshold:
|
| 1172 |
img_width, img_height = image.size
|
|
|
|
| 1182 |
cropped_image = image.crop(
|
| 1183 |
(crop_left, crop_top, crop_right, crop_bottom)
|
| 1184 |
)
|
| 1185 |
+
if use_vlm:
|
| 1186 |
+
# Use VLM for OCR
|
| 1187 |
+
vlm_result = _vlm_ocr_predict(cropped_image)
|
| 1188 |
+
rec_texts = vlm_result.get("rec_texts", [])
|
| 1189 |
+
rec_scores = vlm_result.get("rec_scores", [])
|
| 1190 |
+
else:
|
| 1191 |
+
# Use PaddleOCR
|
| 1192 |
+
cropped_image_np = np.array(cropped_image)
|
| 1193 |
|
| 1194 |
+
if len(cropped_image_np.shape) == 2:
|
| 1195 |
+
cropped_image_np = np.stack([cropped_image_np] * 3, axis=-1)
|
| 1196 |
|
| 1197 |
+
paddle_results = ocr.predict(cropped_image_np)
|
| 1198 |
|
| 1199 |
+
if paddle_results and paddle_results[0]:
|
| 1200 |
+
rec_texts = paddle_results[0].get("rec_texts", [])
|
| 1201 |
+
rec_scores = paddle_results[0].get("rec_scores", [])
|
| 1202 |
+
else:
|
| 1203 |
+
rec_texts = []
|
| 1204 |
+
rec_scores = []
|
| 1205 |
|
| 1206 |
+
if rec_texts and rec_scores:
|
| 1207 |
+
new_text = " ".join(rec_texts)
|
| 1208 |
+
new_conf = int(round(np.median(rec_scores) * 100, 0))
|
| 1209 |
|
| 1210 |
+
# Only replace if Paddle's/VLM's confidence is better
|
| 1211 |
+
if new_conf > conf:
|
| 1212 |
+
ocr_type = "VLM" if use_vlm else "Paddle"
|
| 1213 |
+
print(
|
| 1214 |
+
f" Re-OCR'd word: '{text}' (conf: {conf}) -> '{new_text}' (conf: {new_conf:.0f}) [{ocr_type}]"
|
| 1215 |
+
)
|
| 1216 |
|
| 1217 |
+
# Build a safe filename for exporting example image comparisons (only used when saving examples below)
|
| 1218 |
+
safe_filename = self._create_safe_filename_with_confidence(
|
| 1219 |
+
text, new_text, conf, new_conf, ocr_type
|
| 1220 |
+
)
|
| 1221 |
+
|
| 1222 |
+
if SAVE_EXAMPLE_HYBRID_IMAGES is True:
|
| 1223 |
+
# Normalize and validate image_name to prevent path traversal attacks
|
| 1224 |
+
normalized_image_name = os.path.normpath(image_name + "_" + ocr_type)
|
| 1225 |
+
# Ensure the image name doesn't contain path traversal characters
|
| 1226 |
+
if (
|
| 1227 |
+
".." in normalized_image_name
|
| 1228 |
+
or "/" in normalized_image_name
|
| 1229 |
+
or "\\" in normalized_image_name
|
| 1230 |
+
):
|
| 1231 |
+
normalized_image_name = (
|
| 1232 |
+
"safe_image" # Fallback to safe default
|
|
| 1233 |
)
|
|
|
|
|
|
|
| 1234 |
|
| 1235 |
+
hybrid_ocr_examples_folder = (
|
| 1236 |
+
self.output_folder
|
| 1237 |
+
+ f"/hybrid_ocr_examples/{normalized_image_name}"
|
| 1238 |
+
)
|
| 1239 |
+
# Validate the constructed path is safe before creating directories
|
| 1240 |
+
if not validate_folder_containment(
|
| 1241 |
+
hybrid_ocr_examples_folder, OUTPUT_FOLDER
|
| 1242 |
+
):
|
| 1243 |
+
raise ValueError(
|
| 1244 |
+
f"Unsafe hybrid_ocr_examples folder path: {hybrid_ocr_examples_folder}"
|
| 1245 |
+
)
|
| 1246 |
|
| 1247 |
+
if not os.path.exists(hybrid_ocr_examples_folder):
|
| 1248 |
+
os.makedirs(hybrid_ocr_examples_folder)
|
| 1249 |
+
output_image_path = (
|
| 1250 |
+
hybrid_ocr_examples_folder + f"/{safe_filename}.png"
|
| 1251 |
)
|
| 1252 |
+
print(f"Saving example image to {output_image_path}")
|
| 1253 |
+
cropped_image.save(output_image_path)
|
| 1254 |
+
|
| 1255 |
+
text = new_text
|
| 1256 |
+
conf = new_conf
|
| 1257 |
+
model_used = ocr_type # Update model to VLM or Paddle
|
| 1258 |
+
|
| 1259 |
else:
|
| 1260 |
+
ocr_type = "VLM" if use_vlm else "Paddle"
|
| 1261 |
print(
|
| 1262 |
+
f" '{text}' (conf: {conf}) -> {ocr_type} result '{new_text}' (conf: {new_conf:.0f}) was not better. Keeping original."
|
| 1263 |
)
|
|
|
|
| 1264 |
else:
|
| 1265 |
+
# OCR ran but found nothing, discard original word
|
| 1266 |
+
ocr_type = "VLM" if use_vlm else "Paddle"
|
| 1267 |
print(
|
| 1268 |
+
f" '{text}' (conf: {conf}) -> No text found by {ocr_type}. Discarding."
|
| 1269 |
)
|
| 1270 |
text = ""
|
| 1271 |
|
|
|
|
| 1277 |
final_data["width"].append(width)
|
| 1278 |
final_data["height"].append(height)
|
| 1279 |
final_data["conf"].append(int(conf))
|
| 1280 |
+
final_data["model"].append(model_used)
|
| 1281 |
# final_data['line_number'].append(int(line_number))
|
| 1282 |
|
| 1283 |
return final_data
|
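The returned dictionary mirrors pytesseract.image_to_data output, extended with the per-word model column (illustrative values):

final_data = {
    "text":   ["John", "Smith"],
    "left":   [100, 200],
    "top":    [50, 50],
    "width":  [80, 100],
    "height": [30, 30],
    "conf":   [95, 78],
    "model":  ["Tesseract", "Paddle"],  # "Paddle" or "VLM" marks re-OCR'd words
}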
|
|
|
| 1308 |
image_width, image_height = image.size
|
| 1309 |
|
| 1310 |
# Note: In testing I haven't seen that this necessarily improves results
|
| 1311 |
+
if self.ocr_engine == "hybrid-paddle":
|
| 1312 |
# Try hybrid with original image for cropping:
|
| 1313 |
ocr_data = self._perform_hybrid_ocr(image, image_name=image_name)
|
| 1314 |
|
| 1315 |
+
elif self.ocr_engine == "hybrid-vlm":
|
| 1316 |
+
# Try hybrid VLM with original image for cropping:
|
| 1317 |
+
ocr_data = self._perform_hybrid_ocr(image, image_name=image_name)
|
| 1318 |
+
|
| 1319 |
elif self.ocr_engine == "tesseract":
|
| 1320 |
|
| 1321 |
ocr_data = pytesseract.image_to_data(
|
|
|
|
| 1325 |
lang=self.tesseract_lang, # Ensure the Tesseract language data (e.g., fra.traineddata) is installed on your system.
|
| 1326 |
)
|
| 1327 |
|
| 1328 |
+
# Save Tesseract visualization with bounding boxes
|
| 1329 |
+
# if SAVE_TESSERACT_VISUALISATIONS is True:
|
| 1330 |
+
# self._visualize_tesseract_bounding_boxes(
|
| 1331 |
+
# image,
|
| 1332 |
+
# ocr_data,
|
| 1333 |
+
# image_name,
|
| 1334 |
+
# visualisation_folder="tesseract_visualisations",
|
| 1335 |
+
# )
|
| 1336 |
+
|
| 1337 |
elif self.ocr_engine == "paddle":
|
| 1338 |
|
| 1339 |
if ocr is None:
|
|
|
|
| 1385 |
|
| 1386 |
ocr_data = self._convert_paddle_to_tesseract_format(paddle_results)
|
| 1387 |
|
| 1388 |
+
# if SAVE_PADDLE_VISUALISATIONS is True:
|
| 1389 |
+
# # Save Paddle visualization with bounding boxes
|
| 1390 |
+
# self._visualize_tesseract_bounding_boxes(
|
| 1391 |
+
# image,
|
| 1392 |
+
# ocr_data,
|
| 1393 |
+
# image_name,
|
| 1394 |
+
# visualisation_folder="paddle_visualisations",
|
| 1395 |
+
# )
|
| 1396 |
+
|
| 1397 |
else:
|
| 1398 |
raise RuntimeError(f"Unsupported OCR engine: {self.ocr_engine}")
|
| 1399 |
|
| 1400 |
+
# Convert line-level results to word-level if configured and needed
|
| 1401 |
+
if CONVERT_LINE_TO_WORD_LEVEL and self._is_line_level_data(ocr_data):
|
| 1402 |
+
print("Converting line-level OCR results to word-level...")
|
| 1403 |
+
ocr_data = self._convert_line_to_word_level(
|
| 1404 |
+
ocr_data, image_width, image_height
|
| 1405 |
+
)
|
| 1406 |
+
|
| 1407 |
+
# Always check for scale_factor, even if preprocessing_metadata is empty
|
| 1408 |
+
# This ensures rescaling happens correctly when preprocessing was applied
|
| 1409 |
+
scale_factor = preprocessing_metadata.get("scale_factor", 1.0) if preprocessing_metadata else 1.0
|
| 1410 |
+
if scale_factor != 1.0:
|
| 1411 |
+
# print(f"Rescaling OCR data by scale factor: {scale_factor} (converting from preprocessed to original image coordinates)")
|
| 1412 |
+
# print(f"OCR data before rescaling (first 3 entries): {dict((k, v[:3] if isinstance(v, list) else v) for k, v in list(ocr_data.items())[:3])}")
|
| 1413 |
ocr_data = rescale_ocr_data(ocr_data, scale_factor)
|
| 1414 |
+
# print(f"OCR data after rescaling (first 3 entries): {dict((k, v[:3] if isinstance(v, list) else v) for k, v in list(ocr_data.items())[:3])}")
|
| 1415 |
|
| 1416 |
# The rest of your processing pipeline now works for both engines
|
| 1417 |
ocr_result = ocr_data
|
|
|
|
| 1423 |
if text.strip() and int(ocr_result["conf"][i]) > 0
|
| 1424 |
]
|
| 1425 |
|
| 1426 |
+
# Determine default model based on OCR engine if model field is not present
|
| 1427 |
+
if "model" in ocr_result and len(ocr_result["model"]) == len(ocr_result["text"]):
|
| 1428 |
+
# Model field exists and has correct length - use it
|
| 1429 |
+
get_model = lambda idx: ocr_result["model"][idx]
|
| 1430 |
+
else:
|
| 1431 |
+
# Model field not present or incorrect length - use default based on engine
|
| 1432 |
+
default_model = (
|
| 1433 |
+
"Tesseract" if self.ocr_engine == "tesseract" else
|
| 1434 |
+
"Paddle" if self.ocr_engine == "paddle" else
|
| 1435 |
+
"hybrid-paddle" if self.ocr_engine == "hybrid-paddle" else
|
| 1436 |
+
"VLM" if self.ocr_engine == "hybrid-vlm" else None
|
| 1437 |
+
)
|
| 1438 |
+
get_model = lambda idx: default_model
|
| 1439 |
+
|
| 1440 |
return [
|
| 1441 |
OCRResult(
|
| 1442 |
text=clean_unicode_text(ocr_result["text"][i]),
|
|
|
|
| 1445 |
width=ocr_result["width"][i],
|
| 1446 |
height=ocr_result["height"][i],
|
| 1447 |
conf=round(float(ocr_result["conf"][i]), 0),
|
| 1448 |
+
model=get_model(i),
|
| 1449 |
# line_number=ocr_result['abs_line_id'][i]
|
| 1450 |
)
|
| 1451 |
for i in valid_indices
|
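The engine-to-default-model mapping above collapses to a small lookup; an equivalent sketch:

def default_model_for(engine):
    # Used only when the OCR data carries no per-word "model" column.
    return {
        "tesseract": "Tesseract",
        "paddle": "Paddle",
        "hybrid-paddle": "hybrid-paddle",
        "hybrid-vlm": "VLM",
    }.get(engine)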
|
|
|
| 1502 |
if language_supported_entities:
|
| 1503 |
text_analyzer_kwargs["entities"] = language_supported_entities
|
| 1504 |
| 1505 |
else:
|
| 1506 |
print(f"No relevant entities supported for language: {language}")
|
| 1507 |
raise Warning(
|
|
|
|
| 2457 |
word.top + word.height,
|
| 2458 |
),
|
| 2459 |
"conf": word.conf,
|
| 2460 |
+
"model": word.model,
|
| 2461 |
}
|
| 2462 |
for word in current_line
|
| 2463 |
],
|
tools/file_redaction.py
CHANGED
|
@@ -8,7 +8,9 @@ from datetime import datetime
|
|
| 8 |
from typing import Any, Dict, List, Optional, Tuple
|
| 9 |
|
| 10 |
import boto3
|
|
|
|
| 11 |
import gradio as gr
|
|
|
|
| 12 |
import pandas as pd
|
| 13 |
import pymupdf
|
| 14 |
from gradio import Progress
|
|
@@ -53,11 +55,14 @@ from tools.config import (
|
|
| 53 |
MAX_TIME_VALUE,
|
| 54 |
NO_REDACTION_PII_OPTION,
|
| 55 |
OUTPUT_FOLDER,
|
|
|
|
| 56 |
PAGE_BREAK_VALUE,
|
| 57 |
PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS,
|
| 58 |
RETURN_PDF_FOR_REVIEW,
|
| 59 |
RETURN_REDACTED_PDF,
|
| 60 |
RUN_AWS_FUNCTIONS,
|
|
|
|
|
|
|
| 61 |
SELECTABLE_TEXT_EXTRACT_OPTION,
|
| 62 |
TESSERACT_TEXT_EXTRACT_OPTION,
|
| 63 |
TEXTRACT_TEXT_EXTRACT_OPTION,
|
|
@@ -104,6 +109,7 @@ from tools.load_spacy_model_custom_recognisers import (
|
|
| 104 |
)
|
| 105 |
from tools.secure_path_utils import (
|
| 106 |
secure_file_write,
|
|
|
|
| 107 |
validate_path_containment,
|
| 108 |
)
|
| 109 |
|
|
@@ -322,7 +328,7 @@ def choose_and_run_redactor(
|
|
| 322 |
- all_page_line_level_ocr_results (list, optional): All line level text on the page with bounding boxes.
|
| 323 |
- all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
|
| 324 |
- all_page_line_level_ocr_results_with_words_df (pd.Dataframe, optional): All word level text on the page with bounding boxes as a dataframe.
|
| 325 |
-
- chosen_local_model (str): Which local model is being used for OCR on images - uses the value of CHOSEN_LOCAL_OCR_MODEL by default, choices are "tesseract", "paddle" for PaddleOCR, or "hybrid" to combine both.
|
| 326 |
- language (str, optional): The language of the text in the files. Defaults to English.
|
| 327 |
- language (str, optional): The language to use for AWS Comprehend calls. Defaults to the value of language if not provided.
|
| 328 |
- ocr_review_files (list, optional): A list of OCR review files to be used for the redaction process. Defaults to an empty list.
|
|
@@ -978,8 +984,10 @@ def choose_and_run_redactor(
|
|
| 978 |
)
|
| 979 |
|
| 980 |
if not all_page_line_level_ocr_results_with_words:
|
| 981 |
-
if
|
| 982 |
-
|
| 983 |
):
|
| 984 |
(
|
| 985 |
all_page_line_level_ocr_results_with_words,
|
|
@@ -1010,7 +1018,7 @@ def choose_and_run_redactor(
|
|
| 1010 |
(
|
| 1011 |
pymupdf_doc,
|
| 1012 |
all_pages_decision_process_table,
|
| 1013 |
-
|
| 1014 |
new_textract_request_metadata,
|
| 1015 |
annotations_all_pages,
|
| 1016 |
current_loop_page,
|
|
@@ -3118,7 +3126,7 @@ def redact_image_pdf(
|
|
| 3118 |
- text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
|
| 3119 |
- all_page_line_level_ocr_results (optional): List of all page line level OCR results.
|
| 3120 |
- all_page_line_level_ocr_results_with_words (optional): List of all page line level OCR results with words.
|
| 3121 |
-
- chosen_local_model (str, optional): The local model chosen for OCR. Defaults to CHOSEN_LOCAL_OCR_MODEL, other choices are "paddle" for PaddleOCR, or "hybrid" for a combination of both.
|
| 3122 |
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to PAGE_BREAK_VALUE.
|
| 3123 |
- log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
|
| 3124 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
|
@@ -3207,11 +3215,16 @@ def redact_image_pdf(
|
|
| 3207 |
# If running Textract, check if file already exists. If it does, load in existing data
|
| 3208 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
| 3209 |
textract_json_file_path = output_folder + file_name + "_textract.json"
|
| 3210 |
-
|
| 3211 |
-
|
| 3212 |
-
|
|
|
|
|
| 3213 |
)
|
| 3214 |
-
)
|
| 3215 |
original_textract_data = textract_data.copy()
|
| 3216 |
|
| 3217 |
# print("Successfully loaded in Textract analysis results from file")
|
|
@@ -3221,15 +3234,20 @@ def redact_image_pdf(
|
|
| 3221 |
all_page_line_level_ocr_results_with_words_json_file_path = (
|
| 3222 |
output_folder + file_name + "_ocr_results_with_words_local_ocr.json"
|
| 3223 |
)
|
| 3224 |
-
|
| 3225 |
-
|
| 3226 |
-
|
| 3227 |
-
|
| 3228 |
-
|
| 3229 |
-
|
| 3230 |
-
|
| 3231 |
-
|
| 3232 |
-
|
|
|
|
|
|
|
| 3233 |
original_all_page_line_level_ocr_results_with_words = (
|
| 3234 |
all_page_line_level_ocr_results_with_words.copy()
|
| 3235 |
)
|
|
@@ -3536,6 +3554,24 @@ def redact_image_pdf(
|
|
| 3536 |
line_level_ocr_results_df.to_dict("records")
|
| 3537 |
)
|
| 3538 |
|
| 3539 |
if (
|
| 3540 |
pii_identification_method != NO_REDACTION_PII_OPTION
|
| 3541 |
or RETURN_PDF_FOR_REVIEW is True
|
|
@@ -4867,3 +4903,395 @@ def redact_text_pdf(
|
|
| 4867 |
comprehend_query_number,
|
| 4868 |
all_page_line_level_ocr_results_with_words,
|
| 4869 |
)
|
| 8 |
from typing import Any, Dict, List, Optional, Tuple
|
| 9 |
|
| 10 |
import boto3
|
| 11 |
+
import cv2
|
| 12 |
import gradio as gr
|
| 13 |
+
import numpy as np
|
| 14 |
import pandas as pd
|
| 15 |
import pymupdf
|
| 16 |
from gradio import Progress
|
|
|
|
| 55 |
MAX_TIME_VALUE,
|
| 56 |
NO_REDACTION_PII_OPTION,
|
| 57 |
OUTPUT_FOLDER,
|
| 58 |
+
OVERWRITE_EXISTING_OCR_RESULTS,
|
| 59 |
PAGE_BREAK_VALUE,
|
| 60 |
PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS,
|
| 61 |
RETURN_PDF_FOR_REVIEW,
|
| 62 |
RETURN_REDACTED_PDF,
|
| 63 |
RUN_AWS_FUNCTIONS,
|
| 64 |
+
SAVE_TEXTRACT_VISUALISATIONS,
|
| 65 |
+
SAVE_TESSERACT_VISUALISATIONS,
|
| 66 |
SELECTABLE_TEXT_EXTRACT_OPTION,
|
| 67 |
TESSERACT_TEXT_EXTRACT_OPTION,
|
| 68 |
TEXTRACT_TEXT_EXTRACT_OPTION,
|
|
|
|
| 109 |
)
|
| 110 |
from tools.secure_path_utils import (
|
| 111 |
secure_file_write,
|
| 112 |
+
validate_folder_containment,
|
| 113 |
validate_path_containment,
|
| 114 |
)
|
| 115 |
|
|
|
|
| 328 |
- all_page_line_level_ocr_results (list, optional): All line level text on the page with bounding boxes.
|
| 329 |
- all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
|
| 330 |
- all_page_line_level_ocr_results_with_words_df (pd.Dataframe, optional): All word level text on the page with bounding boxes as a dataframe.
|
| 331 |
+
- chosen_local_model (str): Which local model is being used for OCR on images - uses the value of CHOSEN_LOCAL_OCR_MODEL by default, choices are "tesseract", "paddle" for PaddleOCR, or "hybrid-paddle" to combine both.
|
| 332 |
- language (str, optional): The language of the text in the files. Defaults to English.
|
| 333 |
- language (str, optional): The language to use for AWS Comprehend calls. Defaults to the value of language if not provided.
|
| 334 |
- ocr_review_files (list, optional): A list of OCR review files to be used for the redaction process. Defaults to an empty list.
|
|
|
|
| 984 |
)
|
| 985 |
|
| 986 |
if not all_page_line_level_ocr_results_with_words:
|
| 987 |
+
if (
|
| 988 |
+
not OVERWRITE_EXISTING_OCR_RESULTS
|
| 989 |
+
and local_ocr_output_found_checkbox is True
|
| 990 |
+
and os.path.exists(all_page_line_level_ocr_results_with_words_json_file_path)
|
| 991 |
):
|
| 992 |
(
|
| 993 |
all_page_line_level_ocr_results_with_words,
|
|
|
|
| 1018 |
(
|
| 1019 |
pymupdf_doc,
|
| 1020 |
all_pages_decision_process_table,
|
| 1021 |
+
log_files_output_paths,
|
| 1022 |
new_textract_request_metadata,
|
| 1023 |
annotations_all_pages,
|
| 1024 |
current_loop_page,
|
|
|
|
| 3126 |
- text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
|
| 3127 |
- all_page_line_level_ocr_results (optional): List of all page line level OCR results.
|
| 3128 |
- all_page_line_level_ocr_results_with_words (optional): List of all page line level OCR results with words.
|
| 3129 |
+
- chosen_local_model (str, optional): The local model chosen for OCR. Defaults to CHOSEN_LOCAL_OCR_MODEL, other choices are "paddle" for PaddleOCR, or "hybrid-paddle" for a combination of both.
|
| 3130 |
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to PAGE_BREAK_VALUE.
|
| 3131 |
- log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
|
| 3132 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
|
|
|
| 3215 |
# If running Textract, check if file already exists. If it does, load in existing data
|
| 3216 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
| 3217 |
textract_json_file_path = output_folder + file_name + "_textract.json"
|
| 3218 |
+
if OVERWRITE_EXISTING_OCR_RESULTS:
|
| 3219 |
+
# Skip loading existing results, start fresh
|
| 3220 |
+
textract_data = {}
|
| 3221 |
+
is_missing = True
|
| 3222 |
+
else:
|
| 3223 |
+
textract_data, is_missing, log_files_output_paths = (
|
| 3224 |
+
load_and_convert_textract_json(
|
| 3225 |
+
textract_json_file_path, log_files_output_paths, page_sizes_df
|
| 3226 |
+
)
|
| 3227 |
)
|
|
|
|
| 3228 |
original_textract_data = textract_data.copy()
|
| 3229 |
|
| 3230 |
# print("Successfully loaded in Textract analysis results from file")
|
|
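Both cached-result branches follow the same overwrite-or-load pattern; a simplified sketch (helper name hypothetical, loader signatures reduced to a single argument):

def cached_or_fresh(path, loader, empty):
    # OVERWRITE_EXISTING_OCR_RESULTS forces a fresh extraction pass,
    # ignoring any JSON results saved by a previous run.
    if OVERWRITE_EXISTING_OCR_RESULTS:
        return empty, True   # (data, is_missing)
    return loader(path)      # loads and converts the saved JSON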
|
|
| 3234 |
all_page_line_level_ocr_results_with_words_json_file_path = (
|
| 3235 |
output_folder + file_name + "_ocr_results_with_words_local_ocr.json"
|
| 3236 |
)
|
| 3237 |
+
if OVERWRITE_EXISTING_OCR_RESULTS:
|
| 3238 |
+
# Skip loading existing results, start fresh
|
| 3239 |
+
all_page_line_level_ocr_results_with_words = []
|
| 3240 |
+
is_missing = True
|
| 3241 |
+
else:
|
| 3242 |
+
(
|
| 3243 |
+
all_page_line_level_ocr_results_with_words,
|
| 3244 |
+
is_missing,
|
| 3245 |
+
log_files_output_paths,
|
| 3246 |
+
) = load_and_convert_ocr_results_with_words_json(
|
| 3247 |
+
all_page_line_level_ocr_results_with_words_json_file_path,
|
| 3248 |
+
log_files_output_paths,
|
| 3249 |
+
page_sizes_df,
|
| 3250 |
+
)
|
| 3251 |
original_all_page_line_level_ocr_results_with_words = (
|
| 3252 |
all_page_line_level_ocr_results_with_words.copy()
|
| 3253 |
)
|
|
|
|
| 3554 |
line_level_ocr_results_df.to_dict("records")
|
| 3555 |
)
|
| 3556 |
|
| 3557 |
+
# Save OCR visualization with bounding boxes (works for all OCR methods)
|
| 3558 |
+
if (
|
| 3559 |
+
text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
|
| 3560 |
+
and SAVE_TEXTRACT_VISUALISATIONS is True
|
| 3561 |
+
) or (
|
| 3562 |
+
text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION
|
| 3563 |
+
and SAVE_TESSERACT_VISUALISATIONS is True
|
| 3564 |
+
):
|
| 3565 |
+
if page_line_level_ocr_results_with_words and "results" in page_line_level_ocr_results_with_words:
|
| 3566 |
+
log_files_output_paths = visualise_ocr_words_bounding_boxes(
|
| 3567 |
+
image,
|
| 3568 |
+
page_line_level_ocr_results_with_words["results"],
|
| 3569 |
+
image_name=f"{file_name}_{reported_page_number}",
|
| 3570 |
+
output_folder=output_folder,
|
| 3571 |
+
text_extraction_method=text_extraction_method,
|
| 3572 |
+
log_files_output_paths=log_files_output_paths,
|
| 3573 |
+
)
|
| 3574 |
+
|
| 3575 |
if (
|
| 3576 |
pii_identification_method != NO_REDACTION_PII_OPTION
|
| 3577 |
or RETURN_PDF_FOR_REVIEW is True
|
|
|
|
| 4903 |
comprehend_query_number,
|
| 4904 |
all_page_line_level_ocr_results_with_words,
|
| 4905 |
)
|
| 4906 |
+
|
| 4907 |
+
|
| 4908 |
+
def visualise_ocr_words_bounding_boxes(
|
| 4909 |
+
image: "Image.Image",
|
| 4910 |
+
ocr_results: Dict[str, Any],
|
| 4911 |
+
image_name: str = None,
|
| 4912 |
+
output_folder: str = OUTPUT_FOLDER,
|
| 4913 |
+
text_extraction_method: str = None,
|
| 4914 |
+
visualisation_folder: str = None,
|
| 4915 |
+
add_legend: bool = True,
|
| 4916 |
+
log_files_output_paths: List[str] = [],
|
| 4917 |
+
) -> None:
|
| 4918 |
+
"""
|
| 4919 |
+
Visualizes OCR bounding boxes with confidence-based colors and a legend.
|
| 4920 |
+
Handles word-level OCR results from Textract and Tesseract.
|
| 4921 |
+
|
| 4922 |
+
Args:
|
| 4923 |
+
image: The PIL Image object
|
| 4924 |
+
ocr_results: Dictionary containing word-level OCR results
|
| 4925 |
+
image_name: Optional name for the saved image file
|
| 4926 |
+
output_folder: Output folder path
|
| 4927 |
+
text_extraction_method: The text extraction method being used (determines folder name)
|
| 4928 |
+
visualisation_folder: Subfolder name for visualizations (auto-determined if not provided)
|
| 4929 |
+
add_legend: Whether to add a legend to the visualization
|
| 4930 |
+
log_files_output_paths: List of file paths used for saving redaction process logging results.
|
| 4931 |
+
"""
|
| 4932 |
+
# Determine visualization folder based on text extraction method
|
| 4933 |
+
if visualisation_folder is None:
|
| 4934 |
+
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
| 4935 |
+
visualisation_folder = "textract_visualisations"
|
| 4936 |
+
elif text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
|
| 4937 |
+
visualisation_folder = "tesseract_visualisations"
|
| 4938 |
+
else:
|
| 4939 |
+
visualisation_folder = "ocr_visualisations"
|
| 4940 |
+
if not ocr_results:
|
| 4941 |
+
return log_files_output_paths
|
| 4942 |
+
|
| 4943 |
+
# Convert PIL image to OpenCV format
|
| 4944 |
+
image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 4945 |
+
|
| 4946 |
+
# Get image dimensions
|
| 4947 |
+
height, width = image_cv.shape[:2]
|
| 4948 |
+
|
| 4949 |
+
# Define confidence ranges and colors for bounding boxes (bright colors)
|
| 4950 |
+
confidence_ranges = [
|
| 4951 |
+
(80, 100, (0, 255, 0), "High (80-100%)"), # Green
|
| 4952 |
+
(50, 79, (0, 165, 255), "Medium (50-79%)"), # Orange
|
| 4953 |
+
(0, 49, (0, 0, 255), "Low (0-49%)"), # Red
|
| 4954 |
+
]
|
| 4955 |
+
|
| 4956 |
+
# Define darker colors for text on white background
|
| 4957 |
+
text_confidence_ranges = [
|
| 4958 |
+
(80, 100, (0, 150, 0), "High (80-100%)"), # Dark Green
|
| 4959 |
+
(50, 79, (0, 100, 200), "Medium (50-79%)"), # Dark Orange
|
| 4960 |
+
(0, 49, (0, 0, 180), "Low (0-49%)"), # Dark Red
|
| 4961 |
+
]
|
| 4962 |
+
|
| 4963 |
+
# Process each line's words
|
| 4964 |
+
for line_key, line_data in ocr_results.items():
|
| 4965 |
+
if not isinstance(line_data, dict) or 'words' not in line_data:
|
| 4966 |
+
continue
|
| 4967 |
+
|
| 4968 |
+
words = line_data.get('words', [])
|
| 4969 |
+
|
| 4970 |
+
# Process each word in the line
|
| 4971 |
+
for word_data in words:
|
| 4972 |
+
if not isinstance(word_data, dict):
|
| 4973 |
+
continue
|
| 4974 |
+
|
| 4975 |
+
text = word_data.get('text', '')
|
| 4976 |
+
# Handle both 'conf' and 'confidence' field names for compatibility
|
| 4977 |
+
conf = int(word_data.get('conf', word_data.get('confidence', 0)))
|
| 4978 |
+
|
| 4979 |
+
# Skip empty text or invalid confidence
|
| 4980 |
+
if not text.strip() or conf == -1:
|
| 4981 |
+
continue
|
| 4982 |
+
|
| 4983 |
+
# Get bounding box coordinates
|
| 4984 |
+
bbox = word_data.get('bounding_box', (0, 0, 0, 0))
|
| 4985 |
+
if len(bbox) != 4:
|
| 4986 |
+
continue
|
| 4987 |
+
|
| 4988 |
+
x1, y1, x2, y2 = bbox
|
| 4989 |
+
|
| 4990 |
+
# Ensure coordinates are within image bounds
|
| 4991 |
+
x1 = max(0, min(int(x1), width))
|
| 4992 |
+
y1 = max(0, min(int(y1), height))
|
| 4993 |
+
x2 = max(0, min(int(x2), width))
|
| 4994 |
+
y2 = max(0, min(int(y2), height))
|
| 4995 |
+
|
| 4996 |
+
# Skip if bounding box is invalid
|
| 4997 |
+
if x2 <= x1 or y2 <= y1:
|
| 4998 |
+
continue
|
| 4999 |
+
|
| 5000 |
+
# Check if word was replaced by a different model
|
| 5001 |
+
model = word_data.get('model', None)
|
| 5002 |
+
is_replaced = model and model != "Tesseract"
|
| 5003 |
+
|
| 5004 |
+
# Determine bounding box color: grey for replaced words, otherwise based on confidence
|
| 5005 |
+
# if is_replaced:
|
| 5006 |
+
# box_color = (128, 128, 128) # Grey for model replacements (bounding box only)
|
| 5007 |
+
# else:
|
| 5008 |
+
box_color = (0, 0, 255) # Default to red
|
| 5009 |
+
for min_conf, max_conf, conf_color, _ in confidence_ranges:
|
| 5010 |
+
if min_conf <= conf <= max_conf:
|
| 5011 |
+
box_color = conf_color
|
| 5012 |
+
break
|
| 5013 |
+
|
| 5014 |
+
# Draw bounding box
|
| 5015 |
+
cv2.rectangle(image_cv, (x1, y1), (x2, y2), box_color, 1)
|
| 5016 |
+
|
| 5017 |
+
# Add legend
|
| 5018 |
+
if add_legend:
|
| 5019 |
+
add_confidence_legend(image_cv, confidence_ranges, show_model_replacement=True)
|
| 5020 |
+
|
| 5021 |
+
# Create second page with text overlay
|
| 5022 |
+
text_page = np.ones((height, width, 3), dtype=np.uint8) * 255 # White background
|
| 5023 |
+
|
| 5024 |
+
# Process each line's words for text overlay
|
| 5025 |
+
for line_key, line_data in ocr_results.items():
|
| 5026 |
+
if not isinstance(line_data, dict) or 'words' not in line_data:
|
| 5027 |
+
continue
|
| 5028 |
+
|
| 5029 |
+
words = line_data.get('words', [])
|
| 5030 |
+
|
| 5031 |
+
# Process each word in the line
|
| 5032 |
+
for word_data in words:
|
| 5033 |
+
if not isinstance(word_data, dict):
|
| 5034 |
+
continue
|
| 5035 |
+
|
| 5036 |
+
text = word_data.get('text', '')
|
| 5037 |
+
# Handle both 'conf' and 'confidence' field names for compatibility
|
| 5038 |
+
conf = int(word_data.get('conf', word_data.get('confidence', 0)))
|
| 5039 |
+
|
| 5040 |
+
# Skip empty text or invalid confidence
|
| 5041 |
+
if not text.strip() or conf == -1:
|
| 5042 |
+
continue
|
| 5043 |
+
|
| 5044 |
+
# Get bounding box coordinates
|
| 5045 |
+
bbox = word_data.get('bounding_box', (0, 0, 0, 0))
|
| 5046 |
+
if len(bbox) != 4:
|
| 5047 |
+
continue
|
| 5048 |
+
|
| 5049 |
+
x1, y1, x2, y2 = bbox
|
| 5050 |
+
|
| 5051 |
+
# Ensure coordinates are within image bounds
|
| 5052 |
+
x1 = max(0, min(int(x1), width))
|
| 5053 |
+
y1 = max(0, min(int(y1), height))
|
| 5054 |
+
x2 = max(0, min(int(x2), width))
|
| 5055 |
+
y2 = max(0, min(int(y2), height))
|
| 5056 |
+
|
| 5057 |
+
# Skip if bounding box is invalid
|
| 5058 |
+
if x2 <= x1 or y2 <= y1:
|
| 5059 |
+
continue
|
| 5060 |
+
|
| 5061 |
+
# Check if word was replaced by a different model (for reference, but text color always uses confidence)
|
| 5062 |
+
model = word_data.get('model', None)
|
| 5063 |
+
is_replaced = model and model != "Tesseract"
|
| 5064 |
+
|
| 5065 |
+
# Text color always based on confidence (not affected by model replacement)
|
| 5066 |
+
text_color = (0, 0, 180) # Default to dark red
|
| 5067 |
+
for min_conf, max_conf, conf_color, _ in text_confidence_ranges:
|
| 5068 |
+
if min_conf <= conf <= max_conf:
|
| 5069 |
+
text_color = conf_color
|
| 5070 |
+
break
|
| 5071 |
+
|
| 5072 |
+
# Calculate font size to fit text within bounding box
|
| 5073 |
+
box_width = x2 - x1
|
| 5074 |
+
box_height = y2 - y1
|
| 5075 |
+
|
| 5076 |
+
# Start with a reasonable font scale
|
| 5077 |
+
font_scale = 0.5
|
| 5078 |
+
font_thickness = 1
|
| 5079 |
+
font = cv2.FONT_HERSHEY_SIMPLEX
|
| 5080 |
+
|
| 5081 |
+
# Get text size and adjust to fit
|
| 5082 |
+
(text_width, text_height), baseline = cv2.getTextSize(
|
| 5083 |
+
text, font, font_scale, font_thickness
|
| 5084 |
+
)
|
| 5085 |
+
|
| 5086 |
+
# Scale font to fit width (with some padding)
|
| 5087 |
+
if text_width > 0:
|
| 5088 |
+
width_scale = (box_width * 0.9) / text_width
|
| 5089 |
+
else:
|
| 5090 |
+
width_scale = 1.0
|
| 5091 |
+
|
| 5092 |
+
# Scale font to fit height (with some padding)
|
| 5093 |
+
if text_height > 0:
|
| 5094 |
+
height_scale = (box_height * 0.8) / text_height
|
| 5095 |
+
else:
|
| 5096 |
+
height_scale = 1.0
|
| 5097 |
+
|
| 5098 |
+
# Use the smaller scale to ensure text fits both dimensions
|
| 5099 |
+
font_scale = min(font_scale * min(width_scale, height_scale), 2.0) # Cap at 2.0
|
| 5100 |
+
|
| 5101 |
+
# Recalculate text size with adjusted font scale
|
| 5102 |
+
(text_width, text_height), baseline = cv2.getTextSize(
|
| 5103 |
+
text, font, font_scale, font_thickness
|
| 5104 |
+
)
|
| 5105 |
+
|
| 5106 |
+
# Center text within bounding box
|
| 5107 |
+
text_x = x1 + (box_width - text_width) // 2
|
| 5108 |
+
text_y = y1 + (box_height + text_height) // 2 # Baseline adjustment
|
| 5109 |
+
|
| 5110 |
+
# Draw text
|
| 5111 |
+
cv2.putText(
|
| 5112 |
+
text_page,
|
| 5113 |
+
text,
|
| 5114 |
+
(text_x, text_y),
|
| 5115 |
+
font,
|
| 5116 |
+
font_scale,
|
| 5117 |
+
text_color,
|
| 5118 |
+
font_thickness,
|
| 5119 |
+
cv2.LINE_AA
|
| 5120 |
+
)
|
| 5121 |
+
|
| 5122 |
+
# Draw grey bounding box for replaced words on text page
|
| 5123 |
+
if is_replaced:
|
| 5124 |
+
box_color = (128, 128, 128) # Grey for model replacements
|
| 5125 |
+
cv2.rectangle(text_page, (x1, y1), (x2, y2), box_color, 1)
|
| 5126 |
+
|
| 5127 |
+
# Add legend to second page
|
| 5128 |
+
if add_legend:
|
| 5129 |
+
add_confidence_legend(text_page, text_confidence_ranges, show_model_replacement=True)
|
| 5130 |
+
|
| 5131 |
+
# Concatenate images horizontally
|
| 5132 |
+
combined_image = np.hstack([image_cv, text_page])
|
| 5133 |
+
|
| 5134 |
+
# Save the visualization
|
| 5135 |
+
if output_folder:
|
| 5136 |
+
textract_viz_folder = os.path.join(output_folder, visualisation_folder)
|
| 5137 |
+
|
| 5138 |
+
# Double-check the constructed path is safe
|
| 5139 |
+
if not validate_folder_containment(textract_viz_folder, OUTPUT_FOLDER):
|
| 5140 |
+
raise ValueError(
|
| 5141 |
+
f"Unsafe textract visualisations folder path: {textract_viz_folder}"
|
| 5142 |
+
)
|
| 5143 |
+
|
| 5144 |
+
os.makedirs(textract_viz_folder, exist_ok=True)
|
| 5145 |
+
|
| 5146 |
+
# Generate filename
|
| 5147 |
+
if image_name:
|
| 5148 |
+
# Remove file extension if present
|
| 5149 |
+
base_name = os.path.splitext(image_name)[0]
|
| 5150 |
+
filename = f"{base_name}_{visualisation_folder}.jpg"
|
| 5151 |
+
else:
|
| 5152 |
+
timestamp = int(time.time())
|
| 5153 |
+
filename = f"{visualisation_folder}_{timestamp}.jpg"
|
| 5154 |
+
|
| 5155 |
+
output_path = os.path.join(textract_viz_folder, filename)
|
| 5156 |
+
|
| 5157 |
+
# Save the combined image
|
| 5158 |
+
cv2.imwrite(output_path, combined_image)
|
| 5159 |
+
print(f"OCR visualization saved to: {output_path}")
|
| 5160 |
+
|
| 5161 |
+
log_files_output_paths.append(output_path)
|
| 5162 |
+
|
| 5163 |
+
return log_files_output_paths
|
| 5164 |
+
|
| 5165 |
+
|
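A usage sketch, assuming word-level results keyed by line (field names follow the loops above):

from PIL import Image

page = Image.open("page_1.png")
results = {
    "line_0": {
        "words": [
            {"text": "John", "bounding_box": (100, 50, 180, 80), "conf": 95, "model": "Tesseract"},
            {"text": "Smith", "bounding_box": (200, 50, 300, 80), "conf": 45, "model": "VLM"},
        ]
    }
}
log_paths = visualise_ocr_words_bounding_boxes(
    page, results, image_name="doc_page_1", output_folder=OUTPUT_FOLDER
)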
| 5166 |
+
def add_confidence_legend(
|
| 5167 |
+
image_cv: np.ndarray,
|
| 5168 |
+
confidence_ranges: List[Tuple],
|
| 5169 |
+
show_model_replacement: bool = False
|
| 5170 |
+
) -> None:
|
| 5171 |
+
"""
|
| 5172 |
+
Adds a confidence legend to the visualization image.
|
| 5173 |
+
|
| 5174 |
+
Args:
|
| 5175 |
+
image_cv: OpenCV image array
|
| 5176 |
+
confidence_ranges: List of tuples containing (min_conf, max_conf, color, label)
|
| 5177 |
+
show_model_replacement: Whether to include a legend entry for model replacements (grey)
|
| 5178 |
+
"""
|
| 5179 |
+
height, width = image_cv.shape[:2]
|
| 5180 |
+
|
| 5181 |
+
# Calculate legend height based on number of items
|
| 5182 |
+
num_items = len(confidence_ranges)
|
| 5183 |
+
if show_model_replacement:
|
| 5184 |
+
num_items += 1 # Add one more for model replacement entry
|
| 5185 |
+
|
| 5186 |
+
# Legend parameters
|
| 5187 |
+
legend_width = 200
|
| 5188 |
+
legend_height = 80 + (num_items * 25) # Dynamic height based on number of items
|
| 5189 |
+
legend_x = width - legend_width - 20
|
| 5190 |
+
legend_y = 20
|
| 5191 |
+
|
| 5192 |
+
# Draw legend background
|
| 5193 |
+
cv2.rectangle(
|
| 5194 |
+
image_cv,
|
| 5195 |
+
(legend_x, legend_y),
|
| 5196 |
+
(legend_x + legend_width, legend_y + legend_height),
|
| 5197 |
+
(255, 255, 255), # White background
|
| 5198 |
+
-1,
|
| 5199 |
+
)
|
| 5200 |
+
cv2.rectangle(
|
| 5201 |
+
image_cv,
|
| 5202 |
+
(legend_x, legend_y),
|
| 5203 |
+
(legend_x + legend_width, legend_y + legend_height),
|
| 5204 |
+
(0, 0, 0), # Black border
|
| 5205 |
+
2,
|
| 5206 |
+
)
|
| 5207 |
+
|
| 5208 |
+
# Add title
|
| 5209 |
+
title_text = "Confidence Levels"
|
| 5210 |
+
font_scale = 0.6
|
| 5211 |
+
font_thickness = 2
|
| 5212 |
+
(title_width, title_height), _ = cv2.getTextSize(
|
| 5213 |
+
title_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness
|
| 5214 |
+
)
|
| 5215 |
+
title_x = legend_x + (legend_width - title_width) // 2
|
| 5216 |
+
title_y = legend_y + title_height + 10
|
| 5217 |
+
cv2.putText(
|
| 5218 |
+
image_cv,
|
| 5219 |
+
title_text,
|
| 5220 |
+
(title_x, title_y),
|
| 5221 |
+
cv2.FONT_HERSHEY_SIMPLEX,
|
| 5222 |
+
font_scale,
|
| 5223 |
+
(0, 0, 0), # Black text
|
| 5224 |
+
font_thickness,
|
| 5225 |
+
)
|
| 5226 |
+
|
| 5227 |
+
# Add confidence range items
|
| 5228 |
+
item_spacing = 25
|
| 5229 |
+
start_y = title_y + 25
|
| 5230 |
+
item_index = 0
|
| 5231 |
+
|
| 5232 |
+
# Add model replacement entry first if enabled
|
| 5233 |
+
if show_model_replacement:
|
| 5234 |
+
item_y = start_y + item_index * item_spacing
|
| 5235 |
+
item_index += 1
|
| 5236 |
+
|
| 5237 |
+
# Draw grey color box
|
| 5238 |
+
box_size = 15
|
| 5239 |
+
box_x = legend_x + 10
|
| 5240 |
+
box_y = item_y - box_size
|
| 5241 |
+
replacement_color = (128, 128, 128) # Grey in BGR
|
| 5242 |
+
cv2.rectangle(
|
| 5243 |
+
image_cv, (box_x, box_y), (box_x + box_size, box_y + box_size), replacement_color, -1
|
| 5244 |
+
)
|
| 5245 |
+
cv2.rectangle(
|
| 5246 |
+
image_cv,
|
| 5247 |
+
(box_x, box_y),
|
| 5248 |
+
(box_x + box_size, box_y + box_size),
|
| 5249 |
+
(0, 0, 0), # Black border
|
| 5250 |
+
1,
|
| 5251 |
+
)
|
| 5252 |
+
|
| 5253 |
+
# Add label text
|
| 5254 |
+
label_x = box_x + box_size + 10
|
| 5255 |
+
label_y = item_y - 5
|
| 5256 |
+
cv2.putText(
|
| 5257 |
+
image_cv,
|
| 5258 |
+
"Model Replacement",
|
| 5259 |
+
(label_x, label_y),
|
| 5260 |
+
cv2.FONT_HERSHEY_SIMPLEX,
|
| 5261 |
+
0.5,
|
| 5262 |
+
(0, 0, 0), # Black text
|
| 5263 |
+
1,
|
| 5264 |
+
)
|
| 5265 |
+
|
| 5266 |
+
# Add confidence range items
|
| 5267 |
+
for i, (min_conf, max_conf, color, label) in enumerate(confidence_ranges):
|
| 5268 |
+
item_y = start_y + (item_index + i) * item_spacing
|
| 5269 |
+
|
| 5270 |
+
# Draw color box
|
| 5271 |
+
box_size = 15
|
| 5272 |
+
box_x = legend_x + 10
|
| 5273 |
+
box_y = item_y - box_size
|
| 5274 |
+
cv2.rectangle(
|
| 5275 |
+
image_cv, (box_x, box_y), (box_x + box_size, box_y + box_size), color, -1
|
| 5276 |
+
)
|
| 5277 |
+
cv2.rectangle(
|
| 5278 |
+
image_cv,
|
| 5279 |
+
(box_x, box_y),
|
| 5280 |
+
(box_x + box_size, box_y + box_size),
|
| 5281 |
+
(0, 0, 0), # Black border
|
| 5282 |
+
1,
|
| 5283 |
+
)
|
| 5284 |
+
|
| 5285 |
+
# Add label text
|
| 5286 |
+
label_x = box_x + box_size + 10
|
| 5287 |
+
label_y = item_y - 5
|
| 5288 |
+
cv2.putText(
|
| 5289 |
+
image_cv,
|
| 5290 |
+
label,
|
| 5291 |
+
(label_x, label_y),
|
| 5292 |
+
cv2.FONT_HERSHEY_SIMPLEX,
|
| 5293 |
+
0.5,
|
| 5294 |
+
(0, 0, 0), # Black text
|
| 5295 |
+
1,
|
| 5296 |
+
)
|
| 5297 |
+
|
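A standalone sketch of the legend rendering on a blank canvas:

import cv2
import numpy as np

canvas = np.ones((400, 600, 3), dtype=np.uint8) * 255  # white page
ranges = [
    (80, 100, (0, 255, 0), "High (80-100%)"),
    (50, 79, (0, 165, 255), "Medium (50-79%)"),
    (0, 49, (0, 0, 255), "Low (0-49%)"),
]
add_confidence_legend(canvas, ranges, show_model_replacement=True)
cv2.imwrite("legend_demo.jpg", canvas)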
tools/run_vlm.py
ADDED
|
@@ -0,0 +1,211 @@
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import time
|
| 4 |
+
from threading import Thread
|
| 5 |
+
|
| 6 |
+
import spaces
|
| 7 |
+
from PIL import Image
|
| 8 |
+
|
| 9 |
+
from tools.config import SHOW_VLM_MODEL_OPTIONS, MAX_SPACES_GPU_RUN_TIME
|
| 10 |
+
|
| 11 |
+
if SHOW_VLM_MODEL_OPTIONS is True:
|
| 12 |
+
import torch
|
| 13 |
+
from huggingface_hub import snapshot_download
|
| 14 |
+
from transformers import (
|
| 15 |
+
AutoModelForCausalLM,
|
| 16 |
+
AutoProcessor,
|
| 17 |
+
Qwen2_5_VLForConditionalGeneration,
|
| 18 |
+
Qwen3VLForConditionalGeneration,
|
| 19 |
+
TextIteratorStreamer,
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
from tools.config import (
|
| 23 |
+
SELECTED_MODEL,
|
| 24 |
+
USE_FLASH_ATTENTION,
|
| 25 |
+
MODEL_CACHE_PATH,
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
# Configuration: Choose which vision model to load
|
| 29 |
+
# Options: "olmOCR-2-7B-1025", "Nanonets-OCR2-3B", "Chandra-OCR", "Dots.OCR"
|
| 30 |
+
# SELECTED_MODEL = os.getenv("VISION_MODEL", "Dots.OCR")
|
| 31 |
+
|
| 32 |
+
# This code uses significant amounts of code from the Hugging Face Space here: https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR3 . Thanks!
|

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
    print("torch.__version__ =", torch.__version__)
    print("torch.version.cuda =", torch.version.cuda)
    print("cuda available:", torch.cuda.is_available())
    print("cuda device count:", torch.cuda.device_count())
    if torch.cuda.is_available():
        print("current device:", torch.cuda.current_device())
        print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

    print("Using device:", device)

    CACHE_PATH = MODEL_CACHE_PATH
    if not os.path.exists(CACHE_PATH):
        os.makedirs(CACHE_PATH)

    # Initialize model and processor variables
    processor = None
    model = None

    print(f"Loading vision model: {SELECTED_MODEL}")

    # Load only the selected model based on configuration
    if SELECTED_MODEL == "olmOCR-2-7B-1025":
        MODEL_ID = "allenai/olmOCR-2-7B-1025"
        processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
        model = (
            Qwen2_5_VLForConditionalGeneration.from_pretrained(
                MODEL_ID, trust_remote_code=True, torch_dtype=torch.float16
            )
            .to(device)
            .eval()
        )

    elif SELECTED_MODEL == "Nanonets-OCR2-3B":
        MODEL_ID = "nanonets/Nanonets-OCR2-3B"
        processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
        model = (
            Qwen2_5_VLForConditionalGeneration.from_pretrained(
                MODEL_ID, trust_remote_code=True, torch_dtype=torch.float16
            )
            .to(device)
            .eval()
        )

    elif SELECTED_MODEL == "Chandra-OCR":
        MODEL_ID = "datalab-to/chandra"
        processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
        model = (
            Qwen3VLForConditionalGeneration.from_pretrained(
                MODEL_ID, trust_remote_code=True, torch_dtype=torch.float16
            )
            .to(device)
            .eval()
        )

    elif SELECTED_MODEL == "Dots.OCR":
        # Download and patch Dots.OCR model
        model_path_d_local = snapshot_download(
            repo_id="rednote-hilab/dots.ocr",
            local_dir=os.path.join(CACHE_PATH, "dots.ocr"),
            max_workers=20,
            local_dir_use_symlinks=False,
        )
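
        # (Comment added for context) The snippet below patches the downloaded
        # configuration_dots.py: it inserts an `attributes` list on DotsVLProcessor
        # when one is missing, which transformers' ProcessorMixin appears to require
        # in order to resolve the image processor and tokenizer sub-components.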
        config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")

        if os.path.exists(config_file_path):
            with open(config_file_path, "r") as f:
                input_code = f.read()

            lines = input_code.splitlines()
            if "class DotsVLProcessor" in input_code and not any(
                "attributes = " in line for line in lines
            ):
                output_lines = []
                for line in lines:
                    output_lines.append(line)
                    if line.strip().startswith("class DotsVLProcessor"):
                        output_lines.append(
                            '    attributes = ["image_processor", "tokenizer"]'
                        )

                with open(config_file_path, "w") as f:
                    f.write("\n".join(output_lines))
                print("Patched configuration_dots.py successfully.")

        sys.path.append(model_path_d_local)

        if USE_FLASH_ATTENTION is True:
            attn_implementation = "flash_attention_2"
        else:
            attn_implementation = "eager"
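        # (Comment added) "flash_attention_2" requires the separate flash-attn
        # package and a supported CUDA GPU; "eager" works everywhere and is the
        # safe fallback.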

        MODEL_ID = model_path_d_local
        processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            attn_implementation=attn_implementation,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        ).eval()

    else:
        raise ValueError(
            f"Invalid model selected: {SELECTED_MODEL}. Valid options are: olmOCR-2-7B-1025, Nanonets-OCR2-3B, Chandra-OCR, Dots.OCR"
        )

    print(f"Successfully loaded {SELECTED_MODEL}")


@spaces.GPU(duration=MAX_SPACES_GPU_RUN_TIME)
def generate_image(
    text: str,
    image: Image.Image,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    repetition_penalty: float,
):
    """
    Generates responses using the configured vision model for image input.
    Streams text to console and returns complete text only at the end.
    """
    if image is None:
        return "Please upload an image."

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": text},
            ],
        }
    ]
    prompt_full = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
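    # (Comment added) {"type": "image"} is a placeholder in the chat template; the
    # actual PIL image is supplied to the processor call below, which pairs it
    # with the rendered prompt.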

    inputs = processor(
        text=[prompt_full], images=[image], return_tensors="pt", padding=True
    ).to(device)

    streamer = TextIteratorStreamer(
        processor, skip_prompt=True, skip_special_tokens=True
    )
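    # (Comment added) TextIteratorStreamer yields decoded text incrementally as
    # generate() runs; passing the processor works here because it delegates
    # decode() to its underlying tokenizer.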
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
    }
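    # (Comment added) model.generate() blocks until generation finishes, so it
    # runs in a background thread while the main thread drains the streamer.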
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        buffer = buffer.replace("<|im_end|>", "")

        # Print to console as it streams
        print(new_text, end="", flush=True)

        time.sleep(0.01)

    # Print final newline after streaming is complete
    print()  # Add newline at the end

    # Return the complete text only at the end
    return buffer
|