seanpedrickcase committed
Commit 5e01004 · Parent: 2f34683

Initial commit for VLM support. Created visualisations for OCR output. Corrected log_file_output_paths reference.

.dockerignore CHANGED
@@ -34,3 +34,4 @@ test/output/*
 test/tmp/*
 test/usage/*
 .ruff_cache/*
+model_cache/*
.gitignore CHANGED
@@ -37,3 +37,4 @@ test/output/*
 test/tmp/*
 test/usage/*
 .ruff_cache/*
+model_cache/*
Dockerfile CHANGED
@@ -16,11 +16,11 @@ RUN apt-get update \
 
 WORKDIR /src
 
-COPY requirements.txt .
+COPY requirements_lightweight.txt .
 
-RUN pip install --verbose --no-cache-dir --target=/install -r requirements.txt && rm requirements.txt
+RUN pip install --verbose --no-cache-dir --target=/install -r requirements_lightweight.txt && rm requirements_lightweight.txt
 
-# Optionally install PaddleOCR if the INSTALL_PADDLEOCR environment variable is set to True. See requirements.txt for more details, including installing the GPU version of PaddleOCR.
+# Optionally install PaddleOCR if the INSTALL_PADDLEOCR environment variable is set to True. See requirements_lightweight.txt for more details, including installing the GPU version of PaddleOCR.
 ARG INSTALL_PADDLEOCR=False
 ENV INSTALL_PADDLEOCR=${INSTALL_PADDLEOCR}
 
@@ -28,6 +28,11 @@ RUN if [ "$INSTALL_PADDLEOCR" = "True" ]; then \
     pip install --verbose --no-cache-dir --target=/install paddleocr==3.3.0 paddlepaddle==3.2.0; \
     fi
 
+RUN if [ "$INSTALL_VLM" = "True" ]; then \
+    pip install --verbose --no-cache-dir --target=/install torch==2.6.0 torchvision --index-url https://download.pytorch.org/whl/cu126; \
+    pip install --verbose --no-cache-dir --target=/install transformers==4.57.1 accelerate==1.11.0 bitsandbytes==0.48.1; \
+    fi
+
 # ===================================================================
 # Stage 2: A common 'base' for both Lambda and Gradio
 # ===================================================================
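For a local build, both optional stacks could then presumably be enabled at build time, e.g. (the image tag is illustrative, and this assumes an INSTALL_VLM build argument is declared alongside INSTALL_PADDLEOCR, since only the latter appears in the hunk above):

docker build --build-arg INSTALL_PADDLEOCR=True --build-arg INSTALL_VLM=True -t doc-redaction .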
README.md CHANGED
@@ -162,7 +162,7 @@ These settings are useful for all users, regardless of whether you are using AWS
 * Set to `True` to display a language selection dropdown in the UI for OCR processing.
 
 * `CHOSEN_LOCAL_OCR_MODEL=tesseract`
-* Choose the backend for local OCR. Options are `tesseract`, `paddle`, or `hybrid`. "Tesseract" is the default, and is recommended. "hybrid" is a combination of the two - first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence. "paddle" will only return whole line text extraction, and so will only work for OCR, not redaction.
+* Choose the backend for local OCR. Options are `tesseract`, `paddle`, or `hybrid-paddle`. "Tesseract" is the default, and is recommended. "hybrid-paddle" is a combination of the two - a first pass is done with Tesseract, and then a second pass is done with PaddleOCR on words with low confidence. "paddle" will only return whole-line text extraction, and so will only work for OCR, not redaction.
 
 * `SESSION_OUTPUT_FOLDER=False`
 * If `True`, redacted files will be saved in unique subfolders within the `output/` directory for each session.
@@ -922,7 +922,7 @@ The hybrid OCR mode uses several configurable parameters:
 
 - **HYBRID_OCR_CONFIDENCE_THRESHOLD** (default: 65): Tesseract confidence score below which PaddleOCR will be used for re-extraction
 - **HYBRID_OCR_PADDING** (default: 1): Padding added to word bounding boxes before re-extraction
-- **SAVE_EXAMPLE_TESSERACT_VS_PADDLE_IMAGES** (default: False): Save comparison images when using hybrid mode
+- **SAVE_EXAMPLE_HYBRID_IMAGES** (default: False): Save comparison images when using hybrid mode
 - **SAVE_PADDLE_VISUALISATIONS** (default: False): Save images with PaddleOCR bounding boxes overlaid
 
 ### When to use different OCR models
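To make the re-extraction rule these settings control concrete, here is a minimal, self-contained Python sketch; the Word dataclass and paddle_reextract stub are illustrative only, and the real logic lives in tools/custom_image_analyser_engine.py:

from dataclasses import dataclass, replace

HYBRID_OCR_CONFIDENCE_THRESHOLD = 65
HYBRID_OCR_PADDING = 1

@dataclass(frozen=True)
class Word:
    text: str
    conf: int
    box: tuple  # (left, top, right, bottom)

def paddle_reextract(box):
    # Stub standing in for a PaddleOCR call on the padded crop
    return "example", 90

def hybrid_pass(words):
    out = []
    for w in words:  # words come from the first Tesseract pass
        if w.conf < HYBRID_OCR_CONFIDENCE_THRESHOLD:
            left, top, right, bottom = w.box
            padded = (left - HYBRID_OCR_PADDING, top - HYBRID_OCR_PADDING,
                      right + HYBRID_OCR_PADDING, bottom + HYBRID_OCR_PADDING)
            text, conf = paddle_reextract(padded)
            if conf > w.conf:  # keep whichever engine is more confident
                w = replace(w, text=text, conf=conf)
        out.append(w)
    return out

print(hybrid_pass([Word("exarnple", 40, (10, 10, 80, 30))]))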
app.py CHANGED
@@ -1242,16 +1242,7 @@ with blocks:
                     label=f"Change default redaction settings.{default_text}{textract_text}{comprehend_text}{open_tab_text}".strip(),
                     open=EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT,
                 ):
-                    text_extract_method_radio.render()
-
-                    if SHOW_AWS_TEXT_EXTRACTION_OPTIONS:
-                        with gr.Accordion(
-                            "Enable AWS Textract signature detection (default is off)",
-                            open=False,
-                        ):
-                            handwrite_signature_checkbox.render()
-                    else:
-                        handwrite_signature_checkbox.render()
+                    text_extract_method_radio.render()
 
                     if SHOW_LOCAL_OCR_MODEL_OPTIONS:
                         with gr.Accordion(
@@ -1259,7 +1250,7 @@ with blocks:
                             open=EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT,
                         ):
                             local_ocr_method_radio = gr.Radio(
-                                label="""Choose local OCR model. "tesseract" is the default and will work for most documents. "paddle" is accurate for whole line text extraction, but word-level extract is not natively supported, and so word bounding boxes will be inaccurate. "hybrid" is a combination of the two - first pass through the redactions will be done with Tesseract, and then a second pass will be done with the chosen hybrid model (default PaddleOCR) on words with low confidence.""",
+                                label="""Choose local OCR model. "tesseract" is the default and will work for most documents. "paddle" is accurate for whole-line text extraction, but word-level extraction is not natively supported, and so word bounding boxes will be inaccurate. "hybrid-paddle" combines the two - a first pass with Tesseract, then a second pass with the chosen hybrid model (default PaddleOCR) on words with low confidence. "hybrid-vlm" likewise runs Tesseract first, then a second pass with the chosen vision model (default Dots.OCR) on words with low confidence.""",
                                 value=CHOSEN_LOCAL_OCR_MODEL,
                                 choices=LOCAL_OCR_MODEL_OPTIONS,
                                 interactive=True,
@@ -1274,6 +1265,15 @@ with blocks:
                                 visible=False,
                             )
 
+                    if SHOW_AWS_TEXT_EXTRACTION_OPTIONS:
+                        with gr.Accordion(
+                            "Enable AWS Textract signature detection (default is off)",
+                            open=False,
+                        ):
+                            handwrite_signature_checkbox.render()
+                    else:
+                        handwrite_signature_checkbox.render()
+
                     with gr.Row(equal_height=True):
                         pii_identification_method_drop.render()
 
@@ -1378,16 +1378,20 @@ with blocks:
             with gr.Row(equal_height=False):
                 with gr.Column(scale=2):
                     textract_job_detail_df = gr.Dataframe(
-                        label="Previous job details",
-                        visible=True,
-                        type="pandas",
-                        wrap=True,
-                        interactive=True,
-                        row_count=(0, "fixed"),
-                        col_count=(5, "fixed"),
-                        static_columns=[0, 1, 2, 3, 4],
-                        max_height=400,
-                    )
+                        pd.DataFrame(
+                            columns=[
+                                "job_id",
+                                "file_name",
+                                "job_type",
+                                "signature_extraction",
+                                "job_date_time",
+                            ]
+                        ),
+                        label="Previous job details",
+                        visible=True,
+                        type="pandas",
+                        wrap=True,
+                    )
                 with gr.Column(scale=1):
                     job_id_textbox = gr.Textbox(
                         label="Job ID to check status",
cli_redact.py CHANGED
@@ -399,7 +399,7 @@ python cli_redact.py --task textract --textract_action list
     )
     pdf_group.add_argument(
         "--chosen_local_ocr_model",
-        choices=["tesseract", "hybrid", "paddle"],
+        choices=["tesseract", "hybrid-paddle", "paddle"],
         default=CHOSEN_LOCAL_OCR_MODEL,
         help="Local OCR model to use.",
     )
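With the renamed choice, a hybrid run from the command line would look like the following (the bracketed part stands in for whatever input and task arguments your run needs):

python cli_redact.py --chosen_local_ocr_model hybrid-paddle [other task arguments]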
requirements.txt CHANGED
@@ -24,13 +24,21 @@ python-dotenv==1.0.1
 awslambdaric==3.1.1
 python-docx==1.2.0
 defusedxml==0.7.1
-# Optional: uncomment the below to install paddleOCR if you want to use hybrid text extraction (tesseract plus paddleocr)
-# paddlepaddle==3.2.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
-# paddleocr==3.3.0
-
 # Test dependencies
 pytest>=7.0.0
 pytest-cov>=4.0.0
+spaces==0.42.1
+# Optional: uncomment the below to install paddleOCR if you want to use hybrid text extraction (tesseract plus paddleocr)
+# paddlepaddle==3.2.0 # Consider installing the GPU version for faster local OCR inference: paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
+# paddleocr==3.3.0
+# For running VLMs
+--extra-index-url https://download.pytorch.org/whl/cu126
+torch==2.6.0
+torchvision
+transformers==4.57.1
+accelerate==1.11.0
+bitsandbytes==0.48.1
+flash-attn==2.8.3 # Only compatible with Linux systems
requirements_lightweight.txt ADDED
@@ -0,0 +1,38 @@
+pdfminer.six==20250506
+pdf2image==1.17.0
+pymupdf==1.26.4
+opencv-python==4.12.0.88
+presidio_analyzer==2.2.360
+presidio_anonymizer==2.2.360
+presidio-image-redactor==0.0.57
+pikepdf==9.11.0
+pandas==2.3.3
+scikit-learn==1.7.2
+spacy==3.8.7
+en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
+gradio==5.49.1
+polars==1.33.1
+boto3==1.40.57
+pyarrow==21.0.0
+openpyxl==3.1.5
+Faker==37.8.0
+python-levenshtein==0.27.1
+spaczz==0.6.1
+https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
+rapidfuzz==3.14.1
+python-dotenv==1.0.1
+awslambdaric==3.1.1
+python-docx==1.2.0
+defusedxml==0.7.1
+# Test dependencies
+pytest>=7.0.0
+pytest-cov>=4.0.0
+spaces==0.42.1
+# Optional: uncomment the below to install paddleOCR if you want to use hybrid text extraction (tesseract plus paddleocr)
+# paddlepaddle==3.2.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
+# paddleocr==3.3.0
src/app_settings.qmd CHANGED
@@ -300,7 +300,7 @@ Configurations related to text extraction, PII detection, and the redaction proc
 ### Local OCR (Tesseract & PaddleOCR)
 
 * **`CHOSEN_LOCAL_OCR_MODEL`**
-    * **Description:** Choose the engine for local OCR: `"tesseract"`, `"paddle"`, or `"hybrid"`.
+    * **Description:** Choose the engine for local OCR: `"tesseract"`, `"paddle"`, or `"hybrid-paddle"`.
     * **Default Value:** `"tesseract"`
 
 * **`SHOW_LOCAL_OCR_MODEL_OPTIONS`**
@@ -308,11 +308,11 @@ Configurations related to text extraction, PII detection, and the redaction proc
     * **Default Value:** `"False"`
 
 * **`HYBRID_OCR_CONFIDENCE_THRESHOLD`**
-    * **Description:** In "hybrid" mode, this is the Tesseract confidence score below which PaddleOCR will be used for re-extraction.
+    * **Description:** In "hybrid-paddle" mode, this is the Tesseract confidence score below which PaddleOCR will be used for re-extraction.
     * **Default Value:** `65`
 
 * **`HYBRID_OCR_PADDING`**
-    * **Description:** In "hybrid" mode, padding added to the word's bounding box before re-extraction.
+    * **Description:** In "hybrid-paddle" mode, padding added to the word's bounding box before re-extraction.
     * **Default Value:** `1`
 
 * **`PADDLE_USE_TEXTLINE_ORIENTATION`**
@@ -323,8 +323,8 @@ Configurations related to text extraction, PII detection, and the redaction proc
     * **Description:** Controls the expansion ratio of the detected text region in PaddleOCR.
     * **Default Value:** `1.2`
 
-* **`SAVE_EXAMPLE_TESSERACT_VS_PADDLE_IMAGES`**
-    * **Description:** Saves comparison images when using "hybrid" OCR mode.
+* **`SAVE_EXAMPLE_HYBRID_IMAGES`**
+    * **Description:** Saves comparison images when using "hybrid-paddle" OCR mode.
     * **Default Value:** `"False"`
 
 * **`SAVE_PADDLE_VISUALISATIONS`**
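Taken together, a typical hybrid-paddle setup in a .env file might look like this (values illustrative):

CHOSEN_LOCAL_OCR_MODEL=hybrid-paddle
SHOW_LOCAL_OCR_MODEL_OPTIONS=True
HYBRID_OCR_CONFIDENCE_THRESHOLD=65
HYBRID_OCR_PADDING=1
SAVE_EXAMPLE_HYBRID_IMAGES=True
SAVE_PADDLE_VISUALISATIONS=False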
src/user_guide.qmd CHANGED
@@ -721,7 +721,7 @@ The hybrid OCR mode uses several configurable parameters:
 
 - **HYBRID_OCR_CONFIDENCE_THRESHOLD** (default: 65): Tesseract confidence score below which PaddleOCR will be used for re-extraction
 - **HYBRID_OCR_PADDING** (default: 1): Padding added to word bounding boxes before re-extraction
-- **SAVE_EXAMPLE_TESSERACT_VS_PADDLE_IMAGES** (default: False): Save comparison images when using hybrid mode
+- **SAVE_EXAMPLE_HYBRID_IMAGES** (default: False): Save comparison images when using hybrid mode
 - **SAVE_PADDLE_VISUALISATIONS** (default: False): Save images with PaddleOCR bounding boxes overlaid
 
 ### When to use different OCR models
tools/config.py CHANGED
@@ -437,10 +437,54 @@ DEFAULT_TABULAR_ANONYMISATION_STRATEGY = get_or_create_env_var(
     "DEFAULT_TABULAR_ANONYMISATION_STRATEGY", "redact completely"
 )
 
+###
+# LOCAL OCR MODEL OPTIONS
+###
+
+
+### VLM OPTIONS
+
+SHOW_VLM_MODEL_OPTIONS = convert_string_to_boolean(
+    get_or_create_env_var("SHOW_VLM_MODEL_OPTIONS", "False")
+)  # Whether to show the VLM model options in the UI
+
+SELECTED_MODEL = get_or_create_env_var(
+    "SELECTED_MODEL", "Dots.OCR"
+)  # Selected vision model. Choose from: "olmOCR-2-7B-1025", "Nanonets-OCR2-3B", "Chandra-OCR", "Dots.OCR"
+
+if SHOW_VLM_MODEL_OPTIONS:
+    VLM_MODEL_OPTIONS = [
+        SELECTED_MODEL,
+    ]
+
+MAX_SPACES_GPU_RUN_TIME = int(
+    get_or_create_env_var("MAX_SPACES_GPU_RUN_TIME", "60")
+)  # Maximum number of seconds to run the GPU on Spaces
+
+MAX_NEW_TOKENS = int(
+    get_or_create_env_var("MAX_NEW_TOKENS", "30")
+)  # Maximum number of tokens to generate
+
+DEFAULT_MAX_NEW_TOKENS = int(
+    get_or_create_env_var("DEFAULT_MAX_NEW_TOKENS", "30")
+)  # Default maximum number of tokens to generate
+
+MAX_INPUT_TOKEN_LENGTH = int(
+    get_or_create_env_var("MAX_INPUT_TOKEN_LENGTH", "4096")
+)  # Maximum number of tokens to input to the VLM
+
+USE_FLASH_ATTENTION = convert_string_to_boolean(
+    get_or_create_env_var("USE_FLASH_ATTENTION", "False")
+)  # Whether to use flash attention for the VLM
+
+OVERWRITE_EXISTING_OCR_RESULTS = convert_string_to_boolean(
+    get_or_create_env_var("OVERWRITE_EXISTING_OCR_RESULTS", "False")
+)  # If True, always create new OCR results instead of loading from existing JSON files
+
 ### Local OCR model - Tesseract vs PaddleOCR
 CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var(
     "CHOSEN_LOCAL_OCR_MODEL", "tesseract"
-)  # Choose between "tesseract", "hybrid", and "paddle". "paddle" is accurate for whole line text extraction, but word-level extract is not natively supported, and so word bounding boxes will be inaccurate. "hybrid" is a combination of the two - first pass through the redactions will be done with Tesseract, and then a second pass will be done with the chosen hybrid model (default PaddleOCR) on words with low confidence.
+)  # Choose between "tesseract", "hybrid-paddle", and "paddle". "paddle" is accurate for whole-line text extraction, but word-level extraction is not natively supported, and so word bounding boxes will be inaccurate. "hybrid-paddle" is a combination of the two - a first pass is done with Tesseract, and then a second pass is done with the chosen hybrid model (default PaddleOCR) on words with low confidence.
 
 SHOW_LOCAL_OCR_MODEL_OPTIONS = convert_string_to_boolean(
     get_or_create_env_var("SHOW_LOCAL_OCR_MODEL_OPTIONS", "False")
@@ -448,12 +492,19 @@ SHOW_LOCAL_OCR_MODEL_OPTIONS = convert_string_to_boolean(
 if SHOW_LOCAL_OCR_MODEL_OPTIONS:
     LOCAL_OCR_MODEL_OPTIONS = [
         "tesseract",
-        "hybrid",
+        "hybrid-paddle",
         "paddle",
     ]
 else:
     LOCAL_OCR_MODEL_OPTIONS = ["tesseract"]
 
+vlm_options = ["hybrid-vlm"]
+if SHOW_VLM_MODEL_OPTIONS:
+    LOCAL_OCR_MODEL_OPTIONS.extend(vlm_options)
+
+MODEL_CACHE_PATH = get_or_create_env_var("MODEL_CACHE_PATH", "./model_cache")
+
+
 HYBRID_OCR_CONFIDENCE_THRESHOLD = int(
     get_or_create_env_var("HYBRID_OCR_CONFIDENCE_THRESHOLD", "65")
 )  # The tesseract confidence threshold under which the text will be passed to PaddleOCR for re-extraction using the hybrid OCR method.
@@ -461,6 +512,14 @@ HYBRID_OCR_PADDING = int(
     get_or_create_env_var("HYBRID_OCR_PADDING", "1")
 )  # The padding to add to the text when passing it to PaddleOCR for re-extraction using the hybrid OCR method.
 
+TESSERACT_SEGMENTATION_LEVEL = get_or_create_env_var(
+    "TESSERACT_SEGMENTATION_LEVEL", "word"
+)  # Tesseract segmentation level: "word" (PSM 11) or "line" (PSM 6)
+
+CONVERT_LINE_TO_WORD_LEVEL = convert_string_to_boolean(
+    get_or_create_env_var("CONVERT_LINE_TO_WORD_LEVEL", "False")
+)  # Whether to convert line-level OCR results to word-level for better precision
+
 PADDLE_USE_TEXTLINE_ORIENTATION = convert_string_to_boolean(
     get_or_create_env_var("PADDLE_USE_TEXTLINE_ORIENTATION", "False")
 )
@@ -469,14 +528,22 @@ PADDLE_DET_DB_UNCLIP_RATIO = float(
     get_or_create_env_var("PADDLE_DET_DB_UNCLIP_RATIO", "1.2")
 )
 
-SAVE_EXAMPLE_TESSERACT_VS_PADDLE_IMAGES = convert_string_to_boolean(
-    get_or_create_env_var("SAVE_EXAMPLE_TESSERACT_VS_PADDLE_IMAGES", "False")
+SAVE_EXAMPLE_HYBRID_IMAGES = convert_string_to_boolean(
+    get_or_create_env_var("SAVE_EXAMPLE_HYBRID_IMAGES", "False")
 )  # Whether to save example images of Tesseract vs PaddleOCR re-extraction in hybrid OCR mode.
 
 SAVE_PADDLE_VISUALISATIONS = convert_string_to_boolean(
     get_or_create_env_var("SAVE_PADDLE_VISUALISATIONS", "False")
 )  # Whether to save visualisations of PaddleOCR bounding boxes.
 
+SAVE_TESSERACT_VISUALISATIONS = convert_string_to_boolean(
+    get_or_create_env_var("SAVE_TESSERACT_VISUALISATIONS", "False")
+)  # Whether to save visualisations of Tesseract bounding boxes.
+
+SAVE_TEXTRACT_VISUALISATIONS = convert_string_to_boolean(
+    get_or_create_env_var("SAVE_TEXTRACT_VISUALISATIONS", "False")
+)  # Whether to save visualisations of AWS Textract bounding boxes.
+
 # Model storage paths for Lambda compatibility
 PADDLE_MODEL_PATH = get_or_create_env_var(
     "PADDLE_MODEL_PATH", ""
@@ -487,7 +554,7 @@ SPACY_MODEL_PATH = get_or_create_env_var(
 )  # Directory for spaCy model storage. Uses default location if not set.
 
 PREPROCESS_LOCAL_OCR_IMAGES = get_or_create_env_var(
-    "PREPROCESS_LOCAL_OCR_IMAGES", "False"
+    "PREPROCESS_LOCAL_OCR_IMAGES", "True"
 )  # Whether to try and preprocess images before extracting text. NOTE: I have found in testing that this doesn't necessarily improve results, and greatly slows down extraction.
 
 # Entities for redaction
@@ -1012,6 +1079,7 @@ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = int(
 )  # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
 
 
+
 ###
 # Config vars output format
 ###
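A condensed sketch of how the new flags combine to build the options list (get_or_create_env_var is simplified here to a plain environment lookup with a default):

import os

def get_or_create_env_var(name, default):
    # Simplified stand-in for the project's helper
    return os.environ.get(name, default)

SHOW_LOCAL_OCR_MODEL_OPTIONS = get_or_create_env_var("SHOW_LOCAL_OCR_MODEL_OPTIONS", "False") == "True"
SHOW_VLM_MODEL_OPTIONS = get_or_create_env_var("SHOW_VLM_MODEL_OPTIONS", "False") == "True"

if SHOW_LOCAL_OCR_MODEL_OPTIONS:
    LOCAL_OCR_MODEL_OPTIONS = ["tesseract", "hybrid-paddle", "paddle"]
else:
    LOCAL_OCR_MODEL_OPTIONS = ["tesseract"]

if SHOW_VLM_MODEL_OPTIONS:
    LOCAL_OCR_MODEL_OPTIONS.extend(["hybrid-vlm"])

print(LOCAL_OCR_MODEL_OPTIONS)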
tools/custom_image_analyser_engine.py CHANGED
@@ -17,22 +17,28 @@ from presidio_analyzer import AnalyzerEngine, RecognizerResult
 
 from tools.config import (
     AWS_PII_OPTION,
+    CONVERT_LINE_TO_WORD_LEVEL,
     DEFAULT_LANGUAGE,
     HYBRID_OCR_CONFIDENCE_THRESHOLD,
     HYBRID_OCR_PADDING,
     LOCAL_OCR_MODEL_OPTIONS,
     LOCAL_PII_OPTION,
+    MAX_NEW_TOKENS,
     OUTPUT_FOLDER,
     PADDLE_DET_DB_UNCLIP_RATIO,
     PADDLE_MODEL_PATH,
     PADDLE_USE_TEXTLINE_ORIENTATION,
     PREPROCESS_LOCAL_OCR_IMAGES,
-    SAVE_EXAMPLE_TESSERACT_VS_PADDLE_IMAGES,
+    SAVE_EXAMPLE_HYBRID_IMAGES,
     SAVE_PADDLE_VISUALISATIONS,
+    SAVE_TESSERACT_VISUALISATIONS,
+    SELECTED_MODEL,
+    TESSERACT_SEGMENTATION_LEVEL,
 )
 from tools.helper_functions import clean_unicode_text
 from tools.load_spacy_model_custom_recognisers import custom_entities
 from tools.presidio_analyzer_custom import recognizer_result_from_dict
+from tools.run_vlm import generate_image as vlm_generate_image
 from tools.secure_path_utils import validate_folder_containment
 from tools.secure_regex_utils import safe_sanitize_text
 
@@ -177,6 +183,7 @@ class OCRResult:
     height: int
     conf: float = None
     line: int = None
+    model: str = None  # Track which OCR model was used (e.g., "Tesseract", "Paddle", "VLM")
 
 
 @dataclass
@@ -368,30 +375,88 @@ class ContrastSegmentedImageEnhancer(ImagePreprocessor):
         adjusted_contrast = contrast
         return adjusted_image, contrast, adjusted_contrast
 
+    def _deskew(self, image_np: np.ndarray) -> np.ndarray:
+        """
+        Corrects the skew of an image.
+        This method works best on a greyscaled image.
+        """
+        # Work on a copy for angle detection
+        gray = (
+            cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
+            if len(image_np.shape) == 3
+            else image_np.copy()
+        )
+
+        # Invert the image for contour finding
+        thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
+
+        coords = np.column_stack(np.where(thresh > 0))
+        angle = cv2.minAreaRect(coords)[-1]
+
+        # Adjust the angle for rotation
+        if angle < -45:
+            angle = -(90 + angle)
+        else:
+            angle = -angle
+
+        # Don't rotate if the angle is negligible
+        if abs(angle) < 0.1:
+            return image_np
+
+        (h, w) = image_np.shape[:2]
+        center = (w // 2, h // 2)
+        M = cv2.getRotationMatrix2D(center, angle, 1.0)
+
+        # Use the original numpy image for the rotation to preserve quality
+        rotated = cv2.warpAffine(
+            image_np, M, (w, h),
+            flags=cv2.INTER_CUBIC,
+            borderMode=cv2.BORDER_REPLICATE,
+        )
+
+        return rotated
+
     def preprocess_image(
-        self, image: Image.Image, perform_binarization: bool = False
+        self,
+        image: Image.Image,
+        perform_deskew: bool = False,
+        perform_binarization: bool = False,
     ) -> Tuple[Image.Image, dict]:
         """
-        A corrected, logical pipeline for OCR preprocessing.
-        Order: Greyscale -> Rescale -> Denoise -> Enhance Contrast -> Binarize
-
-        I have found that binarization is not always helpful with Tesseract, and can sometimes degrade results. So it is off by default.
+        A pipeline for OCR preprocessing.
+        Order: Deskew -> Greyscale -> Rescale -> Denoise -> Enhance Contrast -> Binarize
         """
-        # 1. Convert to greyscale NumPy array
-        image_np = self.convert_image_to_array(image)
-
-        # 2. Rescale image to optimal DPI (while still greyscale)
+        # 1. Convert PIL image to NumPy array for OpenCV processing
+        # (assuming the original image is RGB; OpenCV uses BGR)
+        image_np = np.array(image.convert("RGB"))
+        image_np_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
+
+        # 2. Deskew the image - best done early, on the full-quality image
+        if perform_deskew:
+            deskewed_image_np = self._deskew(image_np_bgr)
+        else:
+            deskewed_image_np = image_np_bgr
+
+        # 3. Convert to greyscale
+        gray_image_np = cv2.cvtColor(deskewed_image_np, cv2.COLOR_BGR2GRAY)
+
+        # 4. Rescale image to optimal DPI
         rescaled_image_np, scale_metadata = self.image_rescaling.preprocess_image(
-            image_np
+            gray_image_np
         )
 
-        # 3. Apply bilateral filtering for noise reduction
+        # 5. Apply filtering for noise reduction (a median filter, e.g.
+        # cv2.medianBlur(rescaled_image_np, 3), is another effective option for scans)
         filtered_image_np, _ = self.bilateral_filter.preprocess_image(rescaled_image_np)
 
-        # 4. Improve contrast
+        # 6. Improve contrast
         adjusted_image_np, _, _ = self._improve_contrast(filtered_image_np)
 
-        # 5. Adaptive Thresholding (Binarization) - This is the final step
+        # 7. Adaptive Thresholding (Binarization) - final optional step
         if perform_binarization:
@@ -404,7 +469,8 @@ class ContrastSegmentedImageEnhancer(ImagePreprocessor):
         final_metadata = {**scale_metadata, **threshold_metadata}
 
         # Convert final numpy array back to PIL Image for return
-        return Image.fromarray(final_image_np), final_metadata
+        # The final image is greyscale, so it's safe to use 'L' mode
+        return Image.fromarray(final_image_np).convert("L"), final_metadata
 
 
 def rescale_ocr_data(ocr_data, scale_factor: float):
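A usage sketch for the revised pipeline, assuming it runs somewhere ContrastSegmentedImageEnhancer is importable (file names are placeholders):

from PIL import Image

enhancer = ContrastSegmentedImageEnhancer()
processed, metadata = enhancer.preprocess_image(Image.open("scan.png"), perform_deskew=True)
processed.save("scan_preprocessed.png")
print(metadata.get("scale_factor"))  # later used to map OCR boxes back to the original size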
@@ -447,10 +513,6 @@ def filter_entities_for_language(
     print(f"No entities provided for language: {language}")
     # raise Warning(f"No entities provided for language: {language}")
 
-    # print("entities:", entities)
-    # print("valid_language_entities:", valid_language_entities)
-    # print("language:", language)
-
     filtered_entities = [
         entity for entity in entities if entity in valid_language_entities
     ]
@@ -467,6 +529,75 @@ def filter_entities_for_language(
     return filtered_entities
 
 
+def _get_tesseract_psm(segmentation_level: str) -> int:
+    """
+    Get the appropriate Tesseract PSM (Page Segmentation Mode) value based on segmentation level.
+
+    Args:
+        segmentation_level: "word" or "line"
+
+    Returns:
+        PSM value for Tesseract configuration
+    """
+    if segmentation_level.lower() == "line":
+        return 6  # Uniform block of text
+    elif segmentation_level.lower() == "word":
+        return 11  # Sparse text (word-level)
+    else:
+        print(
+            f"Warning: Unknown segmentation level '{segmentation_level}', defaulting to word-level (PSM 11)"
+        )
+        return 11
+
+
+def _vlm_ocr_predict(
+    image: Image.Image,
+    prompt: str = "Extract all text from this image. Return only the text, no other information.",
+) -> Dict[str, Any]:
+    """
+    VLM OCR prediction function that mimics PaddleOCR's interface.
+
+    Args:
+        image: PIL Image to process
+        prompt: Text prompt for the VLM
+
+    Returns:
+        Dictionary in PaddleOCR format with 'rec_texts' and 'rec_scores'
+    """
+    try:
+        # Use the VLM to extract text
+        extracted_text = vlm_generate_image(
+            text=prompt,
+            image=image,
+            max_new_tokens=MAX_NEW_TOKENS,
+            temperature=0.7,
+            top_p=0.9,
+            top_k=50,
+            repetition_penalty=1.3,
+        )
+
+        if extracted_text and extracted_text.strip():
+            # Clean the text
+            cleaned_text = extracted_text.strip()
+
+            # Split into words for compatibility with PaddleOCR format
+            words = cleaned_text.split()
+
+            # Create PaddleOCR-compatible result
+            result = {
+                "rec_texts": words,
+                "rec_scores": [0.95] * len(words),  # High confidence for VLM results
+            }
+
+            return result
+        else:
+            return {"rec_texts": [], "rec_scores": []}
+
+    except Exception as e:
+        print(f"VLM OCR error: {e}")
+        return {"rec_texts": [], "rec_scores": []}
+
+
 class CustomImageAnalyzerEngine:
     def __init__(
         self,
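For reference, the two PSM values returned by _get_tesseract_psm map onto a standalone pytesseract call like this (the image path is a placeholder):

import pytesseract
from PIL import Image

config = "--oem 3 --psm 11"  # PSM 11 = sparse text / word boxes; PSM 6 = uniform block / lines
data = pytesseract.image_to_data(
    Image.open("page.png"),  # placeholder input
    output_type=pytesseract.Output.DICT,
    config=config,
)
print(data["text"][:5], data["conf"][:5])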
@@ -481,9 +612,9 @@ class CustomImageAnalyzerEngine:
         """
         Initializes the CustomImageAnalyzerEngine.
 
-        :param ocr_engine: The OCR engine to use ("tesseract", "hybrid", or "paddle").
+        :param ocr_engine: The OCR engine to use ("tesseract", "hybrid-paddle", "hybrid-vlm", or "paddle").
         :param analyzer_engine: The Presidio AnalyzerEngine instance.
-        :param tesseract_config: Configuration string for Tesseract.
+        :param tesseract_config: Configuration string for Tesseract. If None, uses TESSERACT_SEGMENTATION_LEVEL config.
         :param paddle_kwargs: Dictionary of keyword arguments for PaddleOCR constructor.
         :param image_preprocessor: Optional image preprocessor.
         :param language: Preferred OCR language (e.g., "en", "fr", "de"). Defaults to DEFAULT_LANGUAGE.
@@ -511,7 +642,7 @@ class CustomImageAnalyzerEngine:
         )
         self.output_folder = normalized_output_folder
 
-        if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid":
+        if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid-paddle":
             if PaddleOCR is None:
                 raise ImportError(
                     "paddleocr is not installed. Please run 'pip install paddleocr paddlepaddle' in your python environment and retry."
@@ -538,22 +669,39 @@ class CustomImageAnalyzerEngine:
             paddle_kwargs.setdefault("lang", self.paddle_lang)
             self.paddle_ocr = PaddleOCR(**paddle_kwargs)
 
+        elif self.ocr_engine == "hybrid-vlm":
+            # VLM-based hybrid OCR - no additional initialization needed here;
+            # the VLM model is loaded when run_vlm.py is imported
+            print(f"Initializing hybrid VLM OCR with model: {SELECTED_MODEL}")
+            self.paddle_ocr = None  # Not using PaddleOCR
+
         if not analyzer_engine:
             analyzer_engine = AnalyzerEngine()
         self.analyzer_engine = analyzer_engine
 
-        self.tesseract_config = tesseract_config or "--oem 3 --psm 11"
+        # Set Tesseract configuration based on segmentation level
+        if tesseract_config:
+            self.tesseract_config = tesseract_config
+        else:
+            psm_value = _get_tesseract_psm(TESSERACT_SEGMENTATION_LEVEL)
+            self.tesseract_config = f"--oem 3 --psm {psm_value}"
+            # print(
+            #     f"Tesseract configured for {TESSERACT_SEGMENTATION_LEVEL}-level segmentation (PSM {psm_value})"
+            # )
 
         if not image_preprocessor:
             image_preprocessor = ContrastSegmentedImageEnhancer()
         self.image_preprocessor = image_preprocessor
 
-    def _sanitize_filename(self, text: str, max_length: int = 20) -> str:
+    def _sanitize_filename(
+        self, text: str, max_length: int = 20, fallback_prefix: str = "unknown_text"
+    ) -> str:
         """
         Sanitizes text for use in filenames by removing invalid characters and limiting length.
 
         :param text: The text to sanitize
         :param max_length: Maximum length of the sanitized text
+        :param fallback_prefix: Prefix to use if sanitization fails
         :return: Sanitized text safe for filenames
         """
 
@@ -568,7 +716,7 @@ class CustomImageAnalyzerEngine:
 
         # If empty after sanitization, use a default value
        if not sanitized:
-            sanitized = "text"
+            sanitized = fallback_prefix
 
         # Limit to max_length characters
         if len(sanitized) > max_length:
@@ -576,8 +724,139 @@ class CustomImageAnalyzerEngine:
             # Ensure we don't end with an underscore if we cut in the middle
             sanitized = sanitized.rstrip("_")
 
+        # Final check: if still empty or too short, use fallback
+        if not sanitized or len(sanitized) < 3:
+            sanitized = fallback_prefix
+
         return sanitized
 
+    def _create_safe_filename_with_confidence(
+        self,
+        original_text: str,
+        new_text: str,
+        conf: int,
+        new_conf: int,
+        ocr_type: str = "OCR",
+    ) -> str:
+        """
+        Creates a safe filename using confidence values when text sanitization fails.
+
+        Args:
+            original_text: Original text from Tesseract
+            new_text: New text from VLM/PaddleOCR
+            conf: Original confidence score
+            new_conf: New confidence score
+            ocr_type: Type of OCR used (VLM, Paddle, etc.)
+
+        Returns:
+            Safe filename string
+        """
+        # Try to sanitize both texts
+        safe_original = self._sanitize_filename(
+            original_text, max_length=15, fallback_prefix=f"orig_conf_{conf}"
+        )
+        safe_new = self._sanitize_filename(
+            new_text, max_length=15, fallback_prefix=f"new_conf_{new_conf}"
+        )
+
+        # If both sanitizations resulted in fallback names, create a confidence-based name
+        if safe_original.startswith("unknown_text") and safe_new.startswith(
+            "unknown_text"
+        ):
+            return f"{ocr_type}_conf_{conf}_to_conf_{new_conf}"
+
+        return f"{safe_original}_conf_{conf}_to_{safe_new}_conf_{new_conf}"
+
+    def _convert_line_to_word_level(
+        self, line_data: Dict[str, List], image_width: int, image_height: int
+    ) -> Dict[str, List]:
+        """
+        Converts line-level OCR results to word-level results by splitting text and estimating word positions.
+
+        Args:
+            line_data: Dictionary with line-level OCR data (text, left, top, width, height, conf)
+            image_width: Width of the original image
+            image_height: Height of the original image
+
+        Returns:
+            Dictionary with word-level OCR data in Tesseract format
+        """
+        output = {
+            "text": list(),
+            "left": list(),
+            "top": list(),
+            "width": list(),
+            "height": list(),
+            "conf": list(),
+        }
+
+        if not line_data or not line_data.get("text"):
+            return output
+
+        for i in range(len(line_data["text"])):
+            line_text = line_data["text"][i]
+            line_left = line_data["left"][i]
+            line_top = line_data["top"][i]
+            line_width = line_data["width"][i]
+            line_height = line_data["height"][i]
+            line_conf = line_data["conf"][i]
+
+            # Skip empty lines
+            if not line_text.strip():
+                continue
+
+            # Split line into words
+            words = line_text.split()
+            if not words:
+                continue
+
+            # Calculate character width for this line
+            total_chars = len(line_text)
+            avg_char_width = line_width / total_chars if total_chars > 0 else 0
+
+            current_char_offset = 0
+
+            for word in words:
+                # Calculate word width based on character count
+                word_width = float(len(word) * avg_char_width)
+                word_left = line_left + float(current_char_offset * avg_char_width)
+
+                # Ensure word doesn't exceed image boundaries
+                word_left = max(0, min(word_left, image_width - word_width))
+                word_width = min(word_width, image_width - word_left)
+
+                output["text"].append(word)
+                output["left"].append(word_left)
+                output["top"].append(line_top)
+                output["width"].append(word_width)
+                output["height"].append(line_height)
+                output["conf"].append(line_conf)
+
+                # Update offset for the next word (add word length + 1 for the space)
+                current_char_offset += len(word) + 1
+
+        return output
+
+    def _is_line_level_data(self, ocr_data: Dict[str, List]) -> bool:
+        """
+        Determines if OCR data contains line-level results (multiple words per bounding box).
+
+        Args:
+            ocr_data: Dictionary with OCR data
+
+        Returns:
+            True if data appears to be line-level, False otherwise
+        """
+        if not ocr_data or not ocr_data.get("text"):
+            return False
+
+        # Check if any text entries contain multiple words
+        for text in ocr_data["text"]:
+            if text.strip() and len(text.split()) > 1:
+                return True
+
+        return False
+
     def _convert_paddle_to_tesseract_format(
         self, paddle_results: List[Any]
     ) -> Dict[str, List]:
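A quick worked example of the proportional split performed by _convert_line_to_word_level (pure arithmetic, no dependencies):

line_text, line_left, line_width = "hello world", 100.0, 110.0
avg_char_width = line_width / len(line_text)  # 10.0 px per character
# "hello": offset 0 -> left 100.0, width 5 * 10.0 = 50.0
# "world": offset len("hello") + 1 = 6 -> left 160.0, width 50.0
for word, offset in [("hello", 0), ("world", 6)]:
    print(word, line_left + offset * avg_char_width, len(word) * avg_char_width)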
@@ -621,34 +900,207 @@ class CustomImageAnalyzerEngine:
             line_width = float(max(x_coords) - line_left)
             line_height = float(max(y_coords) - line_top)
 
-            # 2. Split the line into words
-            words = line_text.split()
-            if not words:
-                continue
-
-            # 3. Estimate bounding box for each word
-            total_chars = len(line_text)
-            # Avoid division by zero for empty lines
-            avg_char_width = line_width / total_chars if total_chars > 0 else 0
-
-            current_char_offset = 0
-
-            for word in words:
-                word_width = float(len(word) * avg_char_width)
-                word_left = line_left + float(current_char_offset * avg_char_width)
-
-                output["text"].append(word)
-                output["left"].append(word_left)
-                output["top"].append(line_top)
-                output["width"].append(word_width)
-                output["height"].append(line_height)
-                # Use the line's confidence for each word derived from it
-                output["conf"].append(int(line_confidence * 100))
-
-                # Update offset for the next word (add word length + 1 for the space)
-                current_char_offset += len(word) + 1
-
-        return output
+            # Add line-level data
+            output["text"].append(line_text)
+            output["left"].append(line_left)
+            output["top"].append(line_top)
+            output["width"].append(line_width)
+            output["height"].append(line_height)
+            output["conf"].append(int(line_confidence * 100))
+
+        return output
+
+    def _visualize_tesseract_bounding_boxes(
+        self,
+        image: Image.Image,
+        ocr_data: Dict[str, List],
+        image_name: str = None,
+        visualisation_folder: str = "tesseract_visualisations",
+    ) -> None:
+        """
+        Visualizes Tesseract OCR bounding boxes with confidence-based colors and a legend.
+
+        Args:
+            image: The PIL Image object
+            ocr_data: Tesseract OCR data dictionary
+            image_name: Optional name for the saved image file
+        """
+        if not ocr_data or not ocr_data.get("text"):
+            return
+
+        # Convert PIL image to OpenCV format
+        image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
+
+        # Get image dimensions
+        height, width = image_cv.shape[:2]
+
+        # Define confidence ranges and colors
+        confidence_ranges = [
+            (80, 100, (0, 255, 0), "High (80-100%)"),  # Green
+            (50, 79, (0, 165, 255), "Medium (50-79%)"),  # Orange
+            (0, 49, (0, 0, 255), "Low (0-49%)"),  # Red
+        ]
+
+        # Process each detected text element
+        for i in range(len(ocr_data["text"])):
+            text = ocr_data["text"][i]
+            conf = int(ocr_data["conf"][i])
+
+            # Skip empty text or invalid confidence
+            if not text.strip() or conf == -1:
+                continue
+
+            left = ocr_data["left"][i]
+            top = ocr_data["top"][i]
+            width_box = ocr_data["width"][i]
+            height_box = ocr_data["height"][i]
+
+            # Calculate bounding box coordinates
+            x1 = int(left)
+            y1 = int(top)
+            x2 = int(left + width_box)
+            y2 = int(top + height_box)
+
+            # Ensure coordinates are within image bounds
+            x1 = max(0, min(x1, width))
+            y1 = max(0, min(y1, height))
+            x2 = max(0, min(x2, width))
+            y2 = max(0, min(y2, height))
+
+            # Skip if bounding box is invalid
+            if x2 <= x1 or y2 <= y1:
+                continue
+
+            # Determine color based on confidence score
+            color = (0, 0, 255)  # Default to red
+            for min_conf, max_conf, conf_color, _ in confidence_ranges:
+                if min_conf <= conf <= max_conf:
+                    color = conf_color
+                    break
+
+            # Draw bounding box
+            cv2.rectangle(image_cv, (x1, y1), (x2, y2), color, 1)
+
+        # Add legend
+        self._add_confidence_legend(image_cv, confidence_ranges)
+
+        # Save the visualization
+        tesseract_viz_folder = os.path.join(self.output_folder, visualisation_folder)
+
+        # Double-check the constructed path is safe
+        if not validate_folder_containment(tesseract_viz_folder, OUTPUT_FOLDER):
+            raise ValueError(
+                f"Unsafe tesseract visualisations folder path: {tesseract_viz_folder}"
+            )
+
+        os.makedirs(tesseract_viz_folder, exist_ok=True)
+
+        # Generate filename
+        if image_name:
+            # Remove file extension if present
+            base_name = os.path.splitext(image_name)[0]
+            filename = f"{base_name}_{visualisation_folder}.jpg"
+        else:
+            timestamp = int(time.time())
+            filename = f"{visualisation_folder}_{timestamp}.jpg"
+
+        output_path = os.path.join(tesseract_viz_folder, filename)
+
+        # Save the image
+        cv2.imwrite(output_path, image_cv)
+        print(f"Tesseract visualization saved to: {output_path}")
+
+    def _add_confidence_legend(
+        self, image_cv: np.ndarray, confidence_ranges: List[Tuple]
+    ) -> None:
+        """
+        Adds a confidence legend to the visualization image.
+
+        Args:
+            image_cv: OpenCV image array
+            confidence_ranges: List of tuples containing (min_conf, max_conf, color, label)
+        """
+        height, width = image_cv.shape[:2]
+
+        # Legend parameters
+        legend_width = 200
+        legend_height = 100
+        legend_x = width - legend_width - 20
+        legend_y = 20
+
+        # Draw legend background
+        cv2.rectangle(
+            image_cv,
+            (legend_x, legend_y),
+            (legend_x + legend_width, legend_y + legend_height),
+            (255, 255, 255),  # White background
+            -1,
+        )
+        cv2.rectangle(
+            image_cv,
+            (legend_x, legend_y),
+            (legend_x + legend_width, legend_y + legend_height),
+            (0, 0, 0),  # Black border
+            2,
+        )
+
+        # Add title
+        title_text = "Confidence Levels"
+        font_scale = 0.6
+        font_thickness = 2
+        (title_width, title_height), _ = cv2.getTextSize(
+            title_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness
+        )
+        title_x = legend_x + (legend_width - title_width) // 2
 
     def _perform_hybrid_ocr(
         self,
@@ -659,16 +1111,20 @@ class CustomImageAnalyzerEngine:
         image_name: str = "unknown_image_name",
     ) -> Dict[str, list]:
         """
-        Performs OCR using Tesseract for bounding boxes and PaddleOCR for low-confidence text.
         Returns data in the same dictionary format as pytesseract.image_to_data.
         """
-        if ocr is None:
-            if hasattr(self, "paddle_ocr") and self.paddle_ocr is not None:
-                ocr = self.paddle_ocr
-            else:
-                raise ValueError(
-                    "No OCR object provided and 'paddle_ocr' is not initialized."
-                )
 
         print("Starting hybrid OCR process...")
 
@@ -687,6 +1143,7 @@ class CustomImageAnalyzerEngine:
             "width": list(),
             "height": list(),
             "conf": list(),
         }
 
         num_words = len(tesseract_data["text"])
@@ -707,6 +1164,9 @@ class CustomImageAnalyzerEngine:
             height = tesseract_data["height"][i]
             # line_number = tesseract_data['abs_line_id'][i]
 
             # If confidence is low, use PaddleOCR for a second opinion
             if conf < confidence_threshold:
                 img_width, img_height = image.size
@@ -722,82 +1182,90 @@ class CustomImageAnalyzerEngine:
                 cropped_image = image.crop(
                     (crop_left, crop_top, crop_right, crop_bottom)
                 )
-                cropped_image_np = np.array(cropped_image)
 
-                if len(cropped_image_np.shape) == 2:
-                    cropped_image_np = np.stack([cropped_image_np] * 3, axis=-1)
 
-                paddle_results = ocr.predict(cropped_image_np)
 
-                if paddle_results and paddle_results[0]:
-                    rec_texts = paddle_results[0].get("rec_texts", [])
-                    rec_scores = paddle_results[0].get("rec_scores", [])
 
-                    if rec_texts and rec_scores:
-                        new_text = " ".join(rec_texts)
-                        new_conf = int(round(np.median(rec_scores) * 100, 0))
 
-                        # Only replace if Paddle's confidence is better
-                        if new_conf > conf:
-                            print(
-                                f"  Re-OCR'd word: '{text}' (conf: {conf}) -> '{new_text}' (conf: {new_conf:.0f})"
-                            )
 
-                            # For exporting example image comparisons, not used here
-                            safe_text = self._sanitize_filename(text, max_length=20)
-                            self._sanitize_filename(new_text, max_length=20)
-
-                            if SAVE_EXAMPLE_TESSERACT_VS_PADDLE_IMAGES is True:
-                                # Normalize and validate image_name to prevent path traversal attacks
-                                normalized_image_name = os.path.normpath(image_name)
-                                # Ensure the image name doesn't contain path traversal characters
-                                if (
-                                    ".." in normalized_image_name
-                                    or "/" in normalized_image_name
-                                    or "\\" in normalized_image_name
-                                ):
-                                    normalized_image_name = (
-                                        "safe_image"  # Fallback to safe default
-                                    )
-
-                                tess_vs_paddle_examples_folder = (
-                                    self.output_folder
-                                    + f"/tess_vs_paddle_examples/{normalized_image_name}"
-                                )
-                                # Validate the constructed path is safe before creating directories
-                                if not validate_folder_containment(
-                                    tess_vs_paddle_examples_folder, OUTPUT_FOLDER
-                                ):
-                                    raise ValueError(
-                                        f"Unsafe tess_vs_paddle_examples folder path: {tess_vs_paddle_examples_folder}"
-                                    )
-
-                                if not os.path.exists(tess_vs_paddle_examples_folder):
-                                    os.makedirs(tess_vs_paddle_examples_folder)
-                                output_image_path = (
-                                    tess_vs_paddle_examples_folder
-                                    + f"/{safe_text}_conf_{conf}_to_{new_text}_conf_{new_conf}.png"
                                 )
-                                print(f"Saving example image to {output_image_path}")
-                                cropped_image.save(output_image_path)
 
-                            text = new_text
-                            conf = new_conf
 
-                        else:
-                            print(
-                                f"  '{text}' (conf: {conf}) -> Paddle result '{new_text}' (conf: {new_conf:.0f}) was not better. Keeping original."
                             )
                     else:
-                        # Paddle ran but found nothing, so discard the original low-confidence word
                         print(
-                            f"  '{text}' (conf: {conf}) -> No text found by Paddle. Discarding."
                         )
-                        text = ""
                 else:
-                    # Paddle found nothing, discard original word
                     print(
-                        f"  '{text}' (conf: {conf}) -> No text found by Paddle. Discarding."
                     )
                     text = ""
 
@@ -809,6 +1277,7 @@ class CustomImageAnalyzerEngine:
             final_data["width"].append(width)
             final_data["height"].append(height)
             final_data["conf"].append(int(conf))
             # final_data['line_number'].append(int(line_number))
 
         return final_data
@@ -839,10 +1308,14 @@ class CustomImageAnalyzerEngine:
         image_width, image_height = image.size
 
         # Note: In testing I haven't seen that this necessarily improves results
-        if self.ocr_engine == "hybrid":
             # Try hybrid with original image for cropping:
             ocr_data = self._perform_hybrid_ocr(image, image_name=image_name)
 
         elif self.ocr_engine == "tesseract":
 
             ocr_data = pytesseract.image_to_data(
@@ -852,6 +1325,15 @@ class CustomImageAnalyzerEngine:
                 lang=self.tesseract_lang,  # Ensure the Tesseract language data (e.g., fra.traineddata) is installed on your system.
             )
 
         elif self.ocr_engine == "paddle":
 
             if ocr is None:
@@ -903,15 +1385,33 @@ class CustomImageAnalyzerEngine:
 
             ocr_data = self._convert_paddle_to_tesseract_format(paddle_results)
 
         else:
             raise RuntimeError(f"Unsupported OCR engine: {self.ocr_engine}")
 
-        if preprocessing_metadata:
-            scale_factor = preprocessing_metadata.get("scale_factor", 1.0)
-            if scale_factor != 1.0:
-                print(f"Rescaling OCR data by scale factor: {scale_factor}")
-                print(f"OCR data before rescaling: {ocr_data}")
                 ocr_data = rescale_ocr_data(ocr_data, scale_factor)
 
         # The rest of your processing pipeline now works for both engines
         ocr_result = ocr_data
@@ -923,6 +1423,20 @@ class CustomImageAnalyzerEngine:
             if text.strip() and int(ocr_result["conf"][i]) > 0
         ]
 
         return [
             OCRResult(
                 text=clean_unicode_text(ocr_result["text"][i]),
@@ -931,6 +1445,7 @@ class CustomImageAnalyzerEngine:
                 width=ocr_result["width"][i],
                 height=ocr_result["height"][i],
                 conf=round(float(ocr_result["conf"][i]), 0),
                 # line_number=ocr_result['abs_line_id'][i]
             )
             for i in valid_indices
@@ -987,8 +1502,6 @@ class CustomImageAnalyzerEngine:
         if language_supported_entities:
             text_analyzer_kwargs["entities"] = language_supported_entities
 
-        # if language != "en":
-        #     gr.Info(f"Using {str(language_supported_entities)} entities for local model analysis for language: {language}")
         else:
             print(f"No relevant entities supported for language: {language}")
             raise Warning(
@@ -1944,6 +2457,7 @@ def create_ocr_result_with_children(
                     word.top + word.height,
                 ),
                 "conf": word.conf,
             }
             for word in current_line
         ],
1055
+ title_y = legend_y + title_height + 10
1056
+ cv2.putText(
1057
+ image_cv,
1058
+ title_text,
1059
+ (title_x, title_y),
1060
+ cv2.FONT_HERSHEY_SIMPLEX,
1061
+ font_scale,
1062
+ (0, 0, 0), # Black text
1063
+ font_thickness,
1064
+ )
1065
+
1066
+ # Add confidence range items
1067
+ item_spacing = 25
1068
+ start_y = title_y + 25
1069
+
1070
+ for i, (min_conf, max_conf, color, label) in enumerate(confidence_ranges):
1071
+ item_y = start_y + i * item_spacing
1072
+
1073
+ # Draw color box
1074
+ box_size = 15
1075
+ box_x = legend_x + 10
1076
+ box_y = item_y - box_size
1077
+ cv2.rectangle(
1078
+ image_cv,
1079
+ (box_x, box_y),
1080
+ (box_x + box_size, box_y + box_size),
1081
+ color,
1082
+ -1,
1083
+ )
1084
+ cv2.rectangle(
1085
+ image_cv,
1086
+ (box_x, box_y),
1087
+ (box_x + box_size, box_y + box_size),
1088
+ (0, 0, 0), # Black border
1089
+ 1,
1090
+ )
1091
+
1092
+ # Add label text
1093
+ label_x = box_x + box_size + 10
1094
+ label_y = item_y - 5
1095
+ cv2.putText(
1096
+ image_cv,
1097
+ label,
1098
+ (label_x, label_y),
1099
+ cv2.FONT_HERSHEY_SIMPLEX,
1100
+ 0.5,
1101
+ (0, 0, 0), # Black text
1102
+ 1,
1103
+ )
1104
 
1105
  def _perform_hybrid_ocr(
1106
  self,
 
1111
  image_name: str = "unknown_image_name",
1112
  ) -> Dict[str, list]:
1113
  """
1114
+ Performs OCR using Tesseract for bounding boxes and PaddleOCR/VLM for low-confidence text.
1115
  Returns data in the same dictionary format as pytesseract.image_to_data.
1116
  """
1117
+ # Determine if we're using VLM or PaddleOCR
1118
+ use_vlm = self.ocr_engine == "hybrid-vlm"
1119
+
1120
+ if not use_vlm:
1121
+ if ocr is None:
1122
+ if hasattr(self, "paddle_ocr") and self.paddle_ocr is not None:
1123
+ ocr = self.paddle_ocr
1124
+ else:
1125
+ raise ValueError(
1126
+ "No OCR object provided and 'paddle_ocr' is not initialized."
1127
+ )
1128
 
1129
  print("Starting hybrid OCR process...")
1130
 
 
1143
  "width": list(),
1144
  "height": list(),
1145
  "conf": list(),
1146
+ "model": list(), # Track which model was used for each word
1147
  }
1148
 
1149
  num_words = len(tesseract_data["text"])
 
1164
  height = tesseract_data["height"][i]
1165
  # line_number = tesseract_data['abs_line_id'][i]
1166
 
1167
+ # Initialize model as Tesseract (default)
1168
+ model_used = "Tesseract"
1169
+
1170
  # If confidence is low, use PaddleOCR for a second opinion
1171
  if conf < confidence_threshold:
1172
  img_width, img_height = image.size
 
1182
  cropped_image = image.crop(
1183
  (crop_left, crop_top, crop_right, crop_bottom)
1184
  )
1185
+ if use_vlm:
1186
+ # Use VLM for OCR
1187
+ vlm_result = _vlm_ocr_predict(cropped_image)
1188
+ rec_texts = vlm_result.get("rec_texts", [])
1189
+ rec_scores = vlm_result.get("rec_scores", [])
1190
+ else:
1191
+ # Use PaddleOCR
1192
+ cropped_image_np = np.array(cropped_image)
1193
 
1194
+ if len(cropped_image_np.shape) == 2:
1195
+ cropped_image_np = np.stack([cropped_image_np] * 3, axis=-1)
1196
 
1197
+ paddle_results = ocr.predict(cropped_image_np)
1198
 
1199
+ if paddle_results and paddle_results[0]:
1200
+ rec_texts = paddle_results[0].get("rec_texts", [])
1201
+ rec_scores = paddle_results[0].get("rec_scores", [])
1202
+ else:
1203
+ rec_texts = []
1204
+ rec_scores = []
1205
 
1206
+ if rec_texts and rec_scores:
1207
+ new_text = " ".join(rec_texts)
1208
+ new_conf = int(round(np.median(rec_scores) * 100, 0))
1209
 
1210
+ # Only replace if Paddle's/VLM's confidence is better
1211
+ if new_conf > conf:
1212
+ ocr_type = "VLM" if use_vlm else "Paddle"
1213
+ print(
1214
+ f" Re-OCR'd word: '{text}' (conf: {conf}) -> '{new_text}' (conf: {new_conf:.0f}) [{ocr_type}]"
1215
+ )
1216
 
1217
+ # Build a safe filename for exporting example image comparisons (used only when SAVE_EXAMPLE_HYBRID_IMAGES is True)
1218
+ safe_filename = self._create_safe_filename_with_confidence(
1219
+ text, new_text, conf, new_conf, ocr_type
1220
+ )
1221
+
1222
+ if SAVE_EXAMPLE_HYBRID_IMAGES is True:
1223
+ # Normalize and validate image_name to prevent path traversal attacks
1224
+ normalized_image_name = os.path.normpath(image_name + "_" + ocr_type)
1225
+ # Ensure the image name doesn't contain path traversal characters
1226
+ if (
1227
+ ".." in normalized_image_name
1228
+ or "/" in normalized_image_name
1229
+ or "\\" in normalized_image_name
1230
+ ):
1231
+ normalized_image_name = (
1232
+ "safe_image" # Fallback to safe default
1233
  )
 
 
1234
 
1235
+ hybrid_ocr_examples_folder = (
1236
+ self.output_folder
1237
+ + f"/hybrid_ocr_examples/{normalized_image_name}"
1238
+ )
1239
+ # Validate the constructed path is safe before creating directories
1240
+ if not validate_folder_containment(
1241
+ hybrid_ocr_examples_folder, OUTPUT_FOLDER
1242
+ ):
1243
+ raise ValueError(
1244
+ f"Unsafe hybrid_ocr_examples folder path: {hybrid_ocr_examples_folder}"
1245
+ )
1246
 
1247
+ if not os.path.exists(hybrid_ocr_examples_folder):
1248
+ os.makedirs(hybrid_ocr_examples_folder)
1249
+ output_image_path = (
1250
+ hybrid_ocr_examples_folder + f"/{safe_filename}.png"
1251
  )
1252
+ print(f"Saving example image to {output_image_path}")
1253
+ cropped_image.save(output_image_path)
1254
+
1255
+ text = new_text
1256
+ conf = new_conf
1257
+ model_used = ocr_type # Update model to VLM or Paddle
1258
+
1259
  else:
1260
+ ocr_type = "VLM" if use_vlm else "Paddle"
1261
  print(
1262
+ f" '{text}' (conf: {conf}) -> {ocr_type} result '{new_text}' (conf: {new_conf:.0f}) was not better. Keeping original."
1263
  )
 
1264
  else:
1265
+ # OCR ran but found nothing, discard original word
1266
+ ocr_type = "VLM" if use_vlm else "Paddle"
1267
  print(
1268
+ f" '{text}' (conf: {conf}) -> No text found by {ocr_type}. Discarding."
1269
  )
1270
  text = ""
1271
 
 
1277
  final_data["width"].append(width)
1278
  final_data["height"].append(height)
1279
  final_data["conf"].append(int(conf))
1280
+ final_data["model"].append(model_used)
1281
  # final_data['line_number'].append(int(line_number))
1282
 
1283
  return final_data
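The word-replacement rule above boils down to a confidence comparison; a minimal standalone sketch (helper name is illustrative):

    def choose_word(tess_text, tess_conf, alt_text, alt_conf, ocr_type="Paddle"):
        # Keep the second-pass result only when its confidence beats Tesseract's.
        if not alt_text:
            return "", tess_conf, "Tesseract"    # second pass found nothing: word is discarded
        if alt_conf > tess_conf:
            return alt_text, alt_conf, ocr_type  # "Paddle" or "VLM" depending on mode
        return tess_text, tess_conf, "Tesseract"

    print(choose_word("c0rn", 40, "corn", 92))  # ('corn', 92, 'Paddle')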
 
1308
  image_width, image_height = image.size
1309
 
1310
  # Note: In testing, this has not been observed to consistently improve results
1311
+ if self.ocr_engine == "hybrid-paddle":
1312
  # Try hybrid with original image for cropping:
1313
  ocr_data = self._perform_hybrid_ocr(image, image_name=image_name)
1314
 
1315
+ elif self.ocr_engine == "hybrid-vlm":
1316
+ # Try hybrid VLM with original image for cropping:
1317
+ ocr_data = self._perform_hybrid_ocr(image, image_name=image_name)
1318
+
1319
  elif self.ocr_engine == "tesseract":
1320
 
1321
  ocr_data = pytesseract.image_to_data(
 
1325
  lang=self.tesseract_lang, # Ensure the Tesseract language data (e.g., fra.traineddata) is installed on your system.
1326
  )
1327
 
1328
+ # Save Tesseract visualization with bounding boxes
1329
+ # if SAVE_TESSERACT_VISUALISATIONS is True:
1330
+ # self._visualize_tesseract_bounding_boxes(
1331
+ # image,
1332
+ # ocr_data,
1333
+ # image_name,
1334
+ # visualisation_folder="tesseract_visualisations",
1335
+ # )
1336
+
1337
  elif self.ocr_engine == "paddle":
1338
 
1339
  if ocr is None:
 
1385
 
1386
  ocr_data = self._convert_paddle_to_tesseract_format(paddle_results)
1387
 
1388
+ # if SAVE_PADDLE_VISUALISATIONS is True:
1389
+ # # Save Paddle visualization with bounding boxes
1390
+ # self._visualize_tesseract_bounding_boxes(
1391
+ # image,
1392
+ # ocr_data,
1393
+ # image_name,
1394
+ # visualisation_folder="paddle_visualisations",
1395
+ # )
1396
+
1397
  else:
1398
  raise RuntimeError(f"Unsupported OCR engine: {self.ocr_engine}")
1399
 
1400
+ # Convert line-level results to word-level if configured and needed
1401
+ if CONVERT_LINE_TO_WORD_LEVEL and self._is_line_level_data(ocr_data):
1402
+ print("Converting line-level OCR results to word-level...")
1403
+ ocr_data = self._convert_line_to_word_level(
1404
+ ocr_data, image_width, image_height
1405
+ )
1406
+
1407
+ # Always check for scale_factor, even if preprocessing_metadata is empty
1408
+ # This ensures rescaling happens correctly when preprocessing was applied
1409
+ scale_factor = preprocessing_metadata.get("scale_factor", 1.0) if preprocessing_metadata else 1.0
1410
+ if scale_factor != 1.0:
1411
+ # print(f"Rescaling OCR data by scale factor: {scale_factor} (converting from preprocessed to original image coordinates)")
1412
+ # print(f"OCR data before rescaling (first 3 entries): {dict((k, v[:3] if isinstance(v, list) else v) for k, v in list(ocr_data.items())[:3])}")
1413
  ocr_data = rescale_ocr_data(ocr_data, scale_factor)
1414
+ # print(f"OCR data after rescaling (first 3 entries): {dict((k, v[:3] if isinstance(v, list) else v) for k, v in list(ocr_data.items())[:3])}")
1415
 
1416
  # The rest of your processing pipeline now works for both engines
1417
  ocr_result = ocr_data
 
1423
  if text.strip() and int(ocr_result["conf"][i]) > 0
1424
  ]
1425
 
1426
+ # Determine default model based on OCR engine if model field is not present
1427
+ if "model" in ocr_result and len(ocr_result["model"]) == len(ocr_result["text"]):
1428
+ # Model field exists and has correct length - use it
1429
+ get_model = lambda idx: ocr_result["model"][idx]
1430
+ else:
1431
+ # Model field not present or incorrect length - use default based on engine
1432
+ default_model = (
1433
+ "Tesseract" if self.ocr_engine == "tesseract" else
1434
+ "Paddle" if self.ocr_engine == "paddle" else
1435
+ "hybrid-paddle" if self.ocr_engine == "hybrid-paddle" else
1436
+ "VLM" if self.ocr_engine == "hybrid-vlm" else None
1437
+ )
1438
+ get_model = lambda idx: default_model
1439
+
1440
  return [
1441
  OCRResult(
1442
  text=clean_unicode_text(ocr_result["text"][i]),
 
1445
  width=ocr_result["width"][i],
1446
  height=ocr_result["height"][i],
1447
  conf=round(float(ocr_result["conf"][i]), 0),
1448
+ model=get_model(i),
1449
  # line_number=ocr_result['abs_line_id'][i]
1450
  )
1451
  for i in valid_indices
 
1502
  if language_supported_entities:
1503
  text_analyzer_kwargs["entities"] = language_supported_entities
1504
 
 
 
1505
  else:
1506
  print(f"No relevant entities supported for language: {language}")
1507
  raise Warning(
 
2457
  word.top + word.height,
2458
  ),
2459
  "conf": word.conf,
2460
+ "model": word.model,
2461
  }
2462
  for word in current_line
2463
  ],
tools/file_redaction.py CHANGED
@@ -8,7 +8,9 @@ from datetime import datetime
8
  from typing import Any, Dict, List, Optional, Tuple
9
 
10
  import boto3
 
11
  import gradio as gr
 
12
  import pandas as pd
13
  import pymupdf
14
  from gradio import Progress
@@ -53,11 +55,14 @@ from tools.config import (
53
  MAX_TIME_VALUE,
54
  NO_REDACTION_PII_OPTION,
55
  OUTPUT_FOLDER,
 
56
  PAGE_BREAK_VALUE,
57
  PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS,
58
  RETURN_PDF_FOR_REVIEW,
59
  RETURN_REDACTED_PDF,
60
  RUN_AWS_FUNCTIONS,
 
 
61
  SELECTABLE_TEXT_EXTRACT_OPTION,
62
  TESSERACT_TEXT_EXTRACT_OPTION,
63
  TEXTRACT_TEXT_EXTRACT_OPTION,
@@ -104,6 +109,7 @@ from tools.load_spacy_model_custom_recognisers import (
104
  )
105
  from tools.secure_path_utils import (
106
  secure_file_write,
 
107
  validate_path_containment,
108
  )
109
 
@@ -322,7 +328,7 @@ def choose_and_run_redactor(
322
  - all_page_line_level_ocr_results (list, optional): All line level text on the page with bounding boxes.
323
  - all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
324
  - all_page_line_level_ocr_results_with_words_df (pd.Dataframe, optional): All word level text on the page with bounding boxes as a dataframe.
325
- - chosen_local_model (str): Which local model is being used for OCR on images - uses the value of CHOSEN_LOCAL_OCR_MODEL by default, choices are "tesseract", "paddle" for PaddleOCR, or "hybrid" to combine both.
326
  - language (str, optional): The language of the text in the files. Defaults to English.
327
  - language (str, optional): The language to use for AWS Comprehend calls. Defaults to the value of language if not provided.
328
  - ocr_review_files (list, optional): A list of OCR review files to be used for the redaction process. Defaults to an empty list.
@@ -978,8 +984,10 @@ def choose_and_run_redactor(
978
  )
979
 
980
  if not all_page_line_level_ocr_results_with_words:
981
- if local_ocr_output_found_checkbox is True and os.path.exists(
982
- all_page_line_level_ocr_results_with_words_json_file_path
 
 
983
  ):
984
  (
985
  all_page_line_level_ocr_results_with_words,
@@ -1010,7 +1018,7 @@ def choose_and_run_redactor(
1010
  (
1011
  pymupdf_doc,
1012
  all_pages_decision_process_table,
1013
- out_file_paths,
1014
  new_textract_request_metadata,
1015
  annotations_all_pages,
1016
  current_loop_page,
@@ -3118,7 +3126,7 @@ def redact_image_pdf(
3118
  - text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
3119
  - all_page_line_level_ocr_results (optional): List of all page line level OCR results.
3120
  - all_page_line_level_ocr_results_with_words (optional): List of all page line level OCR results with words.
3121
- - chosen_local_model (str, optional): The local model chosen for OCR. Defaults to CHOSEN_LOCAL_OCR_MODEL, other choices are "paddle" for PaddleOCR, or "hybrid" for a combination of both.
3122
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to PAGE_BREAK_VALUE.
3123
  - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
3124
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
@@ -3207,11 +3215,16 @@ def redact_image_pdf(
3207
  # If running Textract, check if file already exists. If it does, load in existing data
3208
  if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
3209
  textract_json_file_path = output_folder + file_name + "_textract.json"
3210
- textract_data, is_missing, log_files_output_paths = (
3211
- load_and_convert_textract_json(
3212
3213
  )
3214
- )
3215
  original_textract_data = textract_data.copy()
3216
 
3217
  # print("Successfully loaded in Textract analysis results from file")
@@ -3221,15 +3234,20 @@ def redact_image_pdf(
3221
  all_page_line_level_ocr_results_with_words_json_file_path = (
3222
  output_folder + file_name + "_ocr_results_with_words_local_ocr.json"
3223
  )
3224
- (
3225
- all_page_line_level_ocr_results_with_words,
3226
- is_missing,
3227
- log_files_output_paths,
3228
- ) = load_and_convert_ocr_results_with_words_json(
3229
- all_page_line_level_ocr_results_with_words_json_file_path,
3230
- log_files_output_paths,
3231
- page_sizes_df,
3232
- )
3233
  original_all_page_line_level_ocr_results_with_words = (
3234
  all_page_line_level_ocr_results_with_words.copy()
3235
  )
@@ -3536,6 +3554,24 @@ def redact_image_pdf(
3536
  line_level_ocr_results_df.to_dict("records")
3537
  )
3538
 
3539
  if (
3540
  pii_identification_method != NO_REDACTION_PII_OPTION
3541
  or RETURN_PDF_FOR_REVIEW is True
@@ -4867,3 +4903,395 @@ def redact_text_pdf(
4867
  comprehend_query_number,
4868
  all_page_line_level_ocr_results_with_words,
4869
  )
8
  from typing import Any, Dict, List, Optional, Tuple
9
 
10
  import boto3
11
+ import cv2
12
  import gradio as gr
13
+ import numpy as np
14
  import pandas as pd
15
  import pymupdf
16
  from gradio import Progress
 
55
  MAX_TIME_VALUE,
56
  NO_REDACTION_PII_OPTION,
57
  OUTPUT_FOLDER,
58
+ OVERWRITE_EXISTING_OCR_RESULTS,
59
  PAGE_BREAK_VALUE,
60
  PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS,
61
  RETURN_PDF_FOR_REVIEW,
62
  RETURN_REDACTED_PDF,
63
  RUN_AWS_FUNCTIONS,
64
+ SAVE_TEXTRACT_VISUALISATIONS,
65
+ SAVE_TESSERACT_VISUALISATIONS,
66
  SELECTABLE_TEXT_EXTRACT_OPTION,
67
  TESSERACT_TEXT_EXTRACT_OPTION,
68
  TEXTRACT_TEXT_EXTRACT_OPTION,
 
109
  )
110
  from tools.secure_path_utils import (
111
  secure_file_write,
112
+ validate_folder_containment,
113
  validate_path_containment,
114
  )
115
 
 
328
  - all_page_line_level_ocr_results (list, optional): All line level text on the page with bounding boxes.
329
  - all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
330
  - all_page_line_level_ocr_results_with_words_df (pd.Dataframe, optional): All word level text on the page with bounding boxes as a dataframe.
331
+ - chosen_local_model (str): Which local model is being used for OCR on images - uses the value of CHOSEN_LOCAL_OCR_MODEL by default; choices are "tesseract", "paddle" for PaddleOCR, "hybrid-paddle" to combine both, or "hybrid-vlm" to combine Tesseract with a vision-language model.
332
  - language (str, optional): The language of the text in the files. Defaults to English.
333
  - language (str, optional): The language to use for AWS Comprehend calls. Defaults to the value of language if not provided.
334
  - ocr_review_files (list, optional): A list of OCR review files to be used for the redaction process. Defaults to an empty list.
 
984
  )
985
 
986
  if not all_page_line_level_ocr_results_with_words:
987
+ if (
988
+ not OVERWRITE_EXISTING_OCR_RESULTS
989
+ and local_ocr_output_found_checkbox is True
990
+ and os.path.exists(all_page_line_level_ocr_results_with_words_json_file_path)
991
  ):
992
  (
993
  all_page_line_level_ocr_results_with_words,
 
1018
  (
1019
  pymupdf_doc,
1020
  all_pages_decision_process_table,
1021
+ log_files_output_paths,
1022
  new_textract_request_metadata,
1023
  annotations_all_pages,
1024
  current_loop_page,
 
3126
  - text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
3127
  - all_page_line_level_ocr_results (optional): List of all page line level OCR results.
3128
  - all_page_line_level_ocr_results_with_words (optional): List of all page line level OCR results with words.
3129
+ - chosen_local_model (str, optional): The local model chosen for OCR. Defaults to CHOSEN_LOCAL_OCR_MODEL; other choices are "paddle" for PaddleOCR, "hybrid-paddle" for a combination of both, or "hybrid-vlm" for Tesseract combined with a vision-language model.
3130
  - page_break_val (int, optional): The value at which to trigger a page break. Defaults to PAGE_BREAK_VALUE.
3131
  - log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
3132
  - max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
 
3215
  # If running Textract, check if file already exists. If it does, load in existing data
3216
  if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
3217
  textract_json_file_path = output_folder + file_name + "_textract.json"
3218
+ if OVERWRITE_EXISTING_OCR_RESULTS:
3219
+ # Skip loading existing results, start fresh
3220
+ textract_data = {}
3221
+ is_missing = True
3222
+ else:
3223
+ textract_data, is_missing, log_files_output_paths = (
3224
+ load_and_convert_textract_json(
3225
+ textract_json_file_path, log_files_output_paths, page_sizes_df
3226
+ )
3227
  )
 
3228
  original_textract_data = textract_data.copy()
3229
 
3230
  # print("Successfully loaded in Textract analysis results from file")
 
3234
  all_page_line_level_ocr_results_with_words_json_file_path = (
3235
  output_folder + file_name + "_ocr_results_with_words_local_ocr.json"
3236
  )
3237
+ if OVERWRITE_EXISTING_OCR_RESULTS:
3238
+ # Skip loading existing results, start fresh
3239
+ all_page_line_level_ocr_results_with_words = []
3240
+ is_missing = True
3241
+ else:
3242
+ (
3243
+ all_page_line_level_ocr_results_with_words,
3244
+ is_missing,
3245
+ log_files_output_paths,
3246
+ ) = load_and_convert_ocr_results_with_words_json(
3247
+ all_page_line_level_ocr_results_with_words_json_file_path,
3248
+ log_files_output_paths,
3249
+ page_sizes_df,
3250
+ )
3251
  original_all_page_line_level_ocr_results_with_words = (
3252
  all_page_line_level_ocr_results_with_words.copy()
3253
  )
 
3554
  line_level_ocr_results_df.to_dict("records")
3555
  )
3556
 
3557
+ # Save OCR visualization with bounding boxes (works for all OCR methods)
3558
+ if (
3559
+ text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
3560
+ and SAVE_TEXTRACT_VISUALISATIONS is True
3561
+ ) or (
3562
+ text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION
3563
+ and SAVE_TESSERACT_VISUALISATIONS is True
3564
+ ):
3565
+ if page_line_level_ocr_results_with_words and "results" in page_line_level_ocr_results_with_words:
3566
+ log_files_output_paths = visualise_ocr_words_bounding_boxes(
3567
+ image,
3568
+ page_line_level_ocr_results_with_words["results"],
3569
+ image_name=f"{file_name}_{reported_page_number}",
3570
+ output_folder=output_folder,
3571
+ text_extraction_method=text_extraction_method,
3572
+ log_files_output_paths=log_files_output_paths,
3573
+ )
3574
+
3575
  if (
3576
  pii_identification_method != NO_REDACTION_PII_OPTION
3577
  or RETURN_PDF_FOR_REVIEW is True
 
4903
  comprehend_query_number,
4904
  all_page_line_level_ocr_results_with_words,
4905
  )
4906
+
4907
+
4908
+ def visualise_ocr_words_bounding_boxes(
4909
+ image: "Image.Image",
4910
+ ocr_results: Dict[str, Any],
4911
+ image_name: str = None,
4912
+ output_folder: str = OUTPUT_FOLDER,
4913
+ text_extraction_method: str = None,
4914
+ visualisation_folder: str = None,
4915
+ add_legend: bool = True,
4916
+ log_files_output_paths: List[str] = [],
4917
+ ) -> List[str]:
4918
+ """
4919
+ Visualizes OCR bounding boxes with confidence-based colors and a legend.
4920
+ Handles word-level OCR results from Textract and Tesseract.
4921
+
4922
+ Args:
4923
+ image: The PIL Image object
4924
+ ocr_results: Dictionary containing word-level OCR results
4925
+ image_name: Optional name for the saved image file
4926
+ output_folder: Output folder path
4927
+ text_extraction_method: The text extraction method being used (determines folder name)
4928
+ visualisation_folder: Subfolder name for visualizations (auto-determined if not provided)
4929
+ add_legend: Whether to add a legend to the visualization
4930
+ log_files_output_paths: List of file paths used for saving redaction process logging results.
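+
+ Returns:
+ The updated log_files_output_paths list, including the path of the saved visualisation image.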
4931
+ """
4932
+ # Determine visualization folder based on text extraction method
4933
+ if visualisation_folder is None:
4934
+ if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
4935
+ visualisation_folder = "textract_visualisations"
4936
+ elif text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
4937
+ visualisation_folder = "tesseract_visualisations"
4938
+ else:
4939
+ visualisation_folder = "ocr_visualisations"
4940
+ if not ocr_results:
4941
+ return log_files_output_paths
4942
+
4943
+ # Convert PIL image to OpenCV format
4944
+ image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
4945
+
4946
+ # Get image dimensions
4947
+ height, width = image_cv.shape[:2]
4948
+
4949
+ # Define confidence ranges and colors for bounding boxes (bright colors)
4950
+ confidence_ranges = [
4951
+ (80, 100, (0, 255, 0), "High (80-100%)"), # Green
4952
+ (50, 79, (0, 165, 255), "Medium (50-79%)"), # Orange
4953
+ (0, 49, (0, 0, 255), "Low (0-49%)"), # Red
4954
+ ]
4955
+
4956
+ # Define darker colors for text on white background
4957
+ text_confidence_ranges = [
4958
+ (80, 100, (0, 150, 0), "High (80-100%)"), # Dark Green
4959
+ (50, 79, (0, 100, 200), "Medium (50-79%)"), # Dark Orange
4960
+ (0, 49, (0, 0, 180), "Low (0-49%)"), # Dark Red
4961
+ ]
4962
+
4963
+ # Process each line's words
4964
+ for line_key, line_data in ocr_results.items():
4965
+ if not isinstance(line_data, dict) or 'words' not in line_data:
4966
+ continue
4967
+
4968
+ words = line_data.get('words', [])
4969
+
4970
+ # Process each word in the line
4971
+ for word_data in words:
4972
+ if not isinstance(word_data, dict):
4973
+ continue
4974
+
4975
+ text = word_data.get('text', '')
4976
+ # Handle both 'conf' and 'confidence' field names for compatibility
4977
+ conf = int(word_data.get('conf', word_data.get('confidence', 0)))
4978
+
4979
+ # Skip empty text or invalid confidence
4980
+ if not text.strip() or conf == -1:
4981
+ continue
4982
+
4983
+ # Get bounding box coordinates
4984
+ bbox = word_data.get('bounding_box', (0, 0, 0, 0))
4985
+ if len(bbox) != 4:
4986
+ continue
4987
+
4988
+ x1, y1, x2, y2 = bbox
4989
+
4990
+ # Ensure coordinates are within image bounds
4991
+ x1 = max(0, min(int(x1), width))
4992
+ y1 = max(0, min(int(y1), height))
4993
+ x2 = max(0, min(int(x2), width))
4994
+ y2 = max(0, min(int(y2), height))
4995
+
4996
+ # Skip if bounding box is invalid
4997
+ if x2 <= x1 or y2 <= y1:
4998
+ continue
4999
+
5000
+ # Check if word was replaced by a different model
5001
+ model = word_data.get('model', None)
5002
+ is_replaced = model and model != "Tesseract"
5003
+
5004
+ # Determine bounding box color: grey for replaced words, otherwise based on confidence
5005
+ # if is_replaced:
5006
+ # box_color = (128, 128, 128) # Grey for model replacements (bounding box only)
5007
+ # else:
5008
+ box_color = (0, 0, 255) # Default to red
5009
+ for min_conf, max_conf, conf_color, _ in confidence_ranges:
5010
+ if min_conf <= conf <= max_conf:
5011
+ box_color = conf_color
5012
+ break
5013
+
5014
+ # Draw bounding box
5015
+ cv2.rectangle(image_cv, (x1, y1), (x2, y2), box_color, 1)
5016
+
5017
+ # Add legend
5018
+ if add_legend:
5019
+ add_confidence_legend(image_cv, confidence_ranges, show_model_replacement=True)
5020
+
5021
+ # Create second page with text overlay
5022
+ text_page = np.ones((height, width, 3), dtype=np.uint8) * 255 # White background
5023
+
5024
+ # Process each line's words for text overlay
5025
+ for line_key, line_data in ocr_results.items():
5026
+ if not isinstance(line_data, dict) or 'words' not in line_data:
5027
+ continue
5028
+
5029
+ words = line_data.get('words', [])
5030
+
5031
+ # Process each word in the line
5032
+ for word_data in words:
5033
+ if not isinstance(word_data, dict):
5034
+ continue
5035
+
5036
+ text = word_data.get('text', '')
5037
+ # Handle both 'conf' and 'confidence' field names for compatibility
5038
+ conf = int(word_data.get('conf', word_data.get('confidence', 0)))
5039
+
5040
+ # Skip empty text or invalid confidence
5041
+ if not text.strip() or conf == -1:
5042
+ continue
5043
+
5044
+ # Get bounding box coordinates
5045
+ bbox = word_data.get('bounding_box', (0, 0, 0, 0))
5046
+ if len(bbox) != 4:
5047
+ continue
5048
+
5049
+ x1, y1, x2, y2 = bbox
5050
+
5051
+ # Ensure coordinates are within image bounds
5052
+ x1 = max(0, min(int(x1), width))
5053
+ y1 = max(0, min(int(y1), height))
5054
+ x2 = max(0, min(int(x2), width))
5055
+ y2 = max(0, min(int(y2), height))
5056
+
5057
+ # Skip if bounding box is invalid
5058
+ if x2 <= x1 or y2 <= y1:
5059
+ continue
5060
+
5061
+ # Check if word was replaced by a different model (for reference, but text color always uses confidence)
5062
+ model = word_data.get('model', None)
5063
+ is_replaced = model and model != "Tesseract"
5064
+
5065
+ # Text color always based on confidence (not affected by model replacement)
5066
+ text_color = (0, 0, 180) # Default to dark red
5067
+ for min_conf, max_conf, conf_color, _ in text_confidence_ranges:
5068
+ if min_conf <= conf <= max_conf:
5069
+ text_color = conf_color
5070
+ break
5071
+
5072
+ # Calculate font size to fit text within bounding box
5073
+ box_width = x2 - x1
5074
+ box_height = y2 - y1
5075
+
5076
+ # Start with a reasonable font scale
5077
+ font_scale = 0.5
5078
+ font_thickness = 1
5079
+ font = cv2.FONT_HERSHEY_SIMPLEX
5080
+
5081
+ # Get text size and adjust to fit
5082
+ (text_width, text_height), baseline = cv2.getTextSize(
5083
+ text, font, font_scale, font_thickness
5084
+ )
5085
+
5086
+ # Scale font to fit width (with some padding)
5087
+ if text_width > 0:
5088
+ width_scale = (box_width * 0.9) / text_width
5089
+ else:
5090
+ width_scale = 1.0
5091
+
5092
+ # Scale font to fit height (with some padding)
5093
+ if text_height > 0:
5094
+ height_scale = (box_height * 0.8) / text_height
5095
+ else:
5096
+ height_scale = 1.0
5097
+
5098
+ # Use the smaller scale to ensure text fits both dimensions
5099
+ font_scale = min(font_scale * min(width_scale, height_scale), 2.0) # Cap at 2.0
5100
+
5101
+ # Recalculate text size with adjusted font scale
5102
+ (text_width, text_height), baseline = cv2.getTextSize(
5103
+ text, font, font_scale, font_thickness
5104
+ )
5105
+
5106
+ # Center text within bounding box
5107
+ text_x = x1 + (box_width - text_width) // 2
5108
+ text_y = y1 + (box_height + text_height) // 2 # Baseline adjustment
5109
+
5110
+ # Draw text
5111
+ cv2.putText(
5112
+ text_page,
5113
+ text,
5114
+ (text_x, text_y),
5115
+ font,
5116
+ font_scale,
5117
+ text_color,
5118
+ font_thickness,
5119
+ cv2.LINE_AA
5120
+ )
5121
+
5122
+ # Draw grey bounding box for replaced words on text page
5123
+ if is_replaced:
5124
+ box_color = (128, 128, 128) # Grey for model replacements
5125
+ cv2.rectangle(text_page, (x1, y1), (x2, y2), box_color, 1)
5126
+
5127
+ # Add legend to second page
5128
+ if add_legend:
5129
+ add_confidence_legend(text_page, text_confidence_ranges, show_model_replacement=True)
5130
+
5131
+ # Concatenate images horizontally
5132
+ combined_image = np.hstack([image_cv, text_page])
5133
+
5134
+ # Save the visualization
5135
+ if output_folder:
5136
+ textract_viz_folder = os.path.join(output_folder, visualisation_folder)
5137
+
5138
+ # Double-check the constructed path is safe
5139
+ if not validate_folder_containment(textract_viz_folder, OUTPUT_FOLDER):
5140
+ raise ValueError(
5141
+ f"Unsafe textract visualisations folder path: {textract_viz_folder}"
5142
+ )
5143
+
5144
+ os.makedirs(textract_viz_folder, exist_ok=True)
5145
+
5146
+ # Generate filename
5147
+ if image_name:
5148
+ # Remove file extension if present
5149
+ base_name = os.path.splitext(image_name)[0]
5150
+ filename = f"{base_name}_{visualisation_folder}.jpg"
5151
+ else:
5152
+ timestamp = int(time.time())
5153
+ filename = f"{visualisation_folder}_{timestamp}.jpg"
5154
+
5155
+ output_path = os.path.join(textract_viz_folder, filename)
5156
+
5157
+ # Save the combined image
5158
+ cv2.imwrite(output_path, combined_image)
5159
+ print(f"OCR visualization saved to: {output_path}")
5160
+
5161
+ log_files_output_paths.append(output_path)
5162
+
5163
+ return log_files_output_paths
5164
+
5165
+
5166
+ def add_confidence_legend(
5167
+ image_cv: np.ndarray,
5168
+ confidence_ranges: List[Tuple],
5169
+ show_model_replacement: bool = False
5170
+ ) -> None:
5171
+ """
5172
+ Adds a confidence legend to the visualization image.
5173
+
5174
+ Args:
5175
+ image_cv: OpenCV image array
5176
+ confidence_ranges: List of tuples containing (min_conf, max_conf, color, label)
5177
+ show_model_replacement: Whether to include a legend entry for model replacements (grey)
5178
+ """
5179
+ height, width = image_cv.shape[:2]
5180
+
5181
+ # Calculate legend height based on number of items
5182
+ num_items = len(confidence_ranges)
5183
+ if show_model_replacement:
5184
+ num_items += 1 # Add one more for model replacement entry
5185
+
5186
+ # Legend parameters
5187
+ legend_width = 200
5188
+ legend_height = 80 + (num_items * 25) # Dynamic height based on number of items
5189
+ legend_x = width - legend_width - 20
5190
+ legend_y = 20
5191
+
5192
+ # Draw legend background
5193
+ cv2.rectangle(
5194
+ image_cv,
5195
+ (legend_x, legend_y),
5196
+ (legend_x + legend_width, legend_y + legend_height),
5197
+ (255, 255, 255), # White background
5198
+ -1,
5199
+ )
5200
+ cv2.rectangle(
5201
+ image_cv,
5202
+ (legend_x, legend_y),
5203
+ (legend_x + legend_width, legend_y + legend_height),
5204
+ (0, 0, 0), # Black border
5205
+ 2,
5206
+ )
5207
+
5208
+ # Add title
5209
+ title_text = "Confidence Levels"
5210
+ font_scale = 0.6
5211
+ font_thickness = 2
5212
+ (title_width, title_height), _ = cv2.getTextSize(
5213
+ title_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness
5214
+ )
5215
+ title_x = legend_x + (legend_width - title_width) // 2
5216
+ title_y = legend_y + title_height + 10
5217
+ cv2.putText(
5218
+ image_cv,
5219
+ title_text,
5220
+ (title_x, title_y),
5221
+ cv2.FONT_HERSHEY_SIMPLEX,
5222
+ font_scale,
5223
+ (0, 0, 0), # Black text
5224
+ font_thickness,
5225
+ )
5226
+
5227
+ # Add confidence range items
5228
+ item_spacing = 25
5229
+ start_y = title_y + 25
5230
+ item_index = 0
5231
+
5232
+ # Add model replacement entry first if enabled
5233
+ if show_model_replacement:
5234
+ item_y = start_y + item_index * item_spacing
5235
+ item_index += 1
5236
+
5237
+ # Draw grey color box
5238
+ box_size = 15
5239
+ box_x = legend_x + 10
5240
+ box_y = item_y - box_size
5241
+ replacement_color = (128, 128, 128) # Grey in BGR
5242
+ cv2.rectangle(
5243
+ image_cv, (box_x, box_y), (box_x + box_size, box_y + box_size), replacement_color, -1
5244
+ )
5245
+ cv2.rectangle(
5246
+ image_cv,
5247
+ (box_x, box_y),
5248
+ (box_x + box_size, box_y + box_size),
5249
+ (0, 0, 0), # Black border
5250
+ 1,
5251
+ )
5252
+
5253
+ # Add label text
5254
+ label_x = box_x + box_size + 10
5255
+ label_y = item_y - 5
5256
+ cv2.putText(
5257
+ image_cv,
5258
+ "Model Replacement",
5259
+ (label_x, label_y),
5260
+ cv2.FONT_HERSHEY_SIMPLEX,
5261
+ 0.5,
5262
+ (0, 0, 0), # Black text
5263
+ 1,
5264
+ )
5265
+
5266
+ # Add confidence range items
5267
+ for i, (min_conf, max_conf, color, label) in enumerate(confidence_ranges):
5268
+ item_y = start_y + (item_index + i) * item_spacing
5269
+
5270
+ # Draw color box
5271
+ box_size = 15
5272
+ box_x = legend_x + 10
5273
+ box_y = item_y - box_size
5274
+ cv2.rectangle(
5275
+ image_cv, (box_x, box_y), (box_x + box_size, box_y + box_size), color, -1
5276
+ )
5277
+ cv2.rectangle(
5278
+ image_cv,
5279
+ (box_x, box_y),
5280
+ (box_x + box_size, box_y + box_size),
5281
+ (0, 0, 0), # Black border
5282
+ 1,
5283
+ )
5284
+
5285
+ # Add label text
5286
+ label_x = box_x + box_size + 10
5287
+ label_y = item_y - 5
5288
+ cv2.putText(
5289
+ image_cv,
5290
+ label,
5291
+ (label_x, label_y),
5292
+ cv2.FONT_HERSHEY_SIMPLEX,
5293
+ 0.5,
5294
+ (0, 0, 0), # Black text
5295
+ 1,
5296
+ )
5297
+
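A hedged usage sketch for the visualiser above (the ocr_results shape is inferred from the loops in the function; the image path and values are illustrative, and the function and config names are assumed to be in scope from this module):

    from PIL import Image

    page_image = Image.open("output/example_page.png")  # illustrative path
    ocr_results = {
        "line_1": {
            "words": [
                {"text": "Hello", "conf": 91, "bounding_box": (10, 10, 60, 30), "model": "Tesseract"},
                {"text": "world", "conf": 45, "bounding_box": (65, 10, 115, 30), "model": "Paddle"},
            ]
        }
    }
    log_paths = visualise_ocr_words_bounding_boxes(
        page_image,
        ocr_results,
        image_name="example_page",
        text_extraction_method=TESSERACT_TEXT_EXTRACT_OPTION,
    )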
tools/run_vlm.py ADDED
@@ -0,0 +1,211 @@
1
+ import os
2
+ import sys
3
+ import time
4
+ from threading import Thread
5
+
6
+ import spaces
7
+ from PIL import Image
8
+
9
+ from tools.config import SHOW_VLM_MODEL_OPTIONS, MAX_SPACES_GPU_RUN_TIME
10
+
11
+ if SHOW_VLM_MODEL_OPTIONS is True:
12
+ import torch
13
+ from huggingface_hub import snapshot_download
14
+ from transformers import (
15
+ AutoModelForCausalLM,
16
+ AutoProcessor,
17
+ Qwen2_5_VLForConditionalGeneration,
18
+ Qwen3VLForConditionalGeneration,
19
+ TextIteratorStreamer,
20
+ )
21
+
22
+ from tools.config import (
23
+ SELECTED_MODEL,
24
+ USE_FLASH_ATTENTION,
25
+ MODEL_CACHE_PATH,
26
+ )
27
+
28
+ # Configuration: Choose which vision model to load
29
+ # Options: "olmOCR-2-7B-1025", "Nanonets-OCR2-3B", "Chandra-OCR", "Dots.OCR"
30
+ # SELECTED_MODEL = os.getenv("VISION_MODEL", "Dots.OCR")
31
+
32
+ # This code reuses significant amounts of code from the Hugging Face space here: https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR3 . Thanks!
33
+
34
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
35
+
36
+ print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
37
+ print("torch.__version__ =", torch.__version__)
38
+ print("torch.version.cuda =", torch.version.cuda)
39
+ print("cuda available:", torch.cuda.is_available())
40
+ print("cuda device count:", torch.cuda.device_count())
41
+ if torch.cuda.is_available():
42
+ print("current device:", torch.cuda.current_device())
43
+ print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
44
+
45
+ print("Using device:", device)
46
+
47
+ CACHE_PATH = MODEL_CACHE_PATH
48
+ if not os.path.exists(CACHE_PATH):
49
+ os.makedirs(CACHE_PATH)
50
+
51
+ # Initialize model and processor variables
52
+ processor = None
53
+ model = None
54
+
55
+ print(f"Loading vision model: {SELECTED_MODEL}")
56
+
57
+ # Load only the selected model based on configuration
58
+ if SELECTED_MODEL == "olmOCR-2-7B-1025":
59
+ MODEL_ID = "allenai/olmOCR-2-7B-1025"
60
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
61
+ model = (
62
+ Qwen2_5_VLForConditionalGeneration.from_pretrained(
63
+ MODEL_ID, trust_remote_code=True, torch_dtype=torch.float16
64
+ )
65
+ .to(device)
66
+ .eval()
67
+ )
68
+
69
+ elif SELECTED_MODEL == "Nanonets-OCR2-3B":
70
+ MODEL_ID = "nanonets/Nanonets-OCR2-3B"
71
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
72
+ model = (
73
+ Qwen2_5_VLForConditionalGeneration.from_pretrained(
74
+ MODEL_ID, trust_remote_code=True, torch_dtype=torch.float16
75
+ )
76
+ .to(device)
77
+ .eval()
78
+ )
79
+
80
+ elif SELECTED_MODEL == "Chandra-OCR":
81
+ MODEL_ID = "datalab-to/chandra"
82
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
83
+ model = (
84
+ Qwen3VLForConditionalGeneration.from_pretrained(
85
+ MODEL_ID, trust_remote_code=True, torch_dtype=torch.float16
86
+ )
87
+ .to(device)
88
+ .eval()
89
+ )
90
+
91
+ elif SELECTED_MODEL == "Dots.OCR":
92
+ # Download and patch Dots.OCR model
93
+ model_path_d_local = snapshot_download(
94
+ repo_id="rednote-hilab/dots.ocr",
95
+ local_dir=os.path.join(CACHE_PATH, "dots.ocr"),
96
+ max_workers=20,
97
+ local_dir_use_symlinks=False,
98
+ )
99
+
100
+ config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")
101
+
102
+ if os.path.exists(config_file_path):
103
+ with open(config_file_path, "r") as f:
104
+ input_code = f.read()
105
+
106
+ lines = input_code.splitlines()
107
+ if "class DotsVLProcessor" in input_code and not any(
108
+ "attributes = " in line for line in lines
109
+ ):
110
+ output_lines = []
111
+ for line in lines:
112
+ output_lines.append(line)
113
+ if line.strip().startswith("class DotsVLProcessor"):
114
+ output_lines.append(
115
+ ' attributes = ["image_processor", "tokenizer"]'
116
+ )
117
+
118
+ with open(config_file_path, "w") as f:
119
+ f.write("\n".join(output_lines))
120
+ print("Patched configuration_dots.py successfully.")
121
+
122
+ sys.path.append(model_path_d_local)
123
+
124
+ if USE_FLASH_ATTENTION is True:
125
+ attn_implementation = "flash_attention_2"
126
+ else:
127
+ attn_implementation = "eager"
128
+
129
+ MODEL_ID = model_path_d_local
130
+ processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
131
+ model = AutoModelForCausalLM.from_pretrained(
132
+ MODEL_ID,
133
+ attn_implementation=attn_implementation,
134
+ torch_dtype=torch.bfloat16,
135
+ device_map="auto",
136
+ trust_remote_code=True,
137
+ ).eval()
138
+
139
+ else:
140
+ raise ValueError(
141
+ f"Invalid model selected: {SELECTED_MODEL}. Valid options are: olmOCR-2-7B-1025, Nanonets-OCR2-3B, Chandra-OCR, Dots.OCR"
142
+ )
143
+
144
+ print(f"Successfully loaded {SELECTED_MODEL}")
145
+
146
+
147
+ @spaces.GPU(duration=MAX_SPACES_GPU_RUN_TIME)
148
+ def generate_image(
149
+ text: str,
150
+ image: Image.Image,
151
+ max_new_tokens: int,
152
+ temperature: float,
153
+ top_p: float,
154
+ top_k: int,
155
+ repetition_penalty: float,
156
+ ):
157
+ """
158
+ Generates responses using the configured vision model for image input.
159
+ Streams text to console and returns complete text only at the end.
160
+ """
161
+ if image is None:
162
+ return "Please upload an image."
163
+
164
+ messages = [
165
+ {
166
+ "role": "user",
167
+ "content": [
168
+ {"type": "image"},
169
+ {"type": "text", "text": text},
170
+ ],
171
+ }
172
+ ]
173
+ prompt_full = processor.apply_chat_template(
174
+ messages, tokenize=False, add_generation_prompt=True
175
+ )
176
+
177
+ inputs = processor(
178
+ text=[prompt_full], images=[image], return_tensors="pt", padding=True
179
+ ).to(device)
180
+
181
+ streamer = TextIteratorStreamer(
182
+ processor, skip_prompt=True, skip_special_tokens=True
183
+ )
184
+ generation_kwargs = {
185
+ **inputs,
186
+ "streamer": streamer,
187
+ "max_new_tokens": max_new_tokens,
188
+ "do_sample": True,
189
+ "temperature": temperature,
190
+ "top_p": top_p,
191
+ "top_k": top_k,
192
+ "repetition_penalty": repetition_penalty,
193
+ }
194
+ thread = Thread(target=model.generate, kwargs=generation_kwargs)
195
+ thread.start()
196
+
197
+ buffer = ""
198
+ for new_text in streamer:
199
+ buffer += new_text
200
+ buffer = buffer.replace("<|im_end|>", "")
201
+
202
+ # Print to console as it streams
203
+ print(new_text, end="", flush=True)
204
+
205
+ time.sleep(0.01)
206
+
207
+ # Print final newline after streaming is complete
208
+ print() # Add newline at the end
209
+
210
+ # Return the complete text only at the end
211
+ return buffer
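Finally, a hedged usage sketch for generate_image (the sampling parameter values here are illustrative choices, not defaults taken from the module):

    from PIL import Image

    page = Image.open("example_page.png")  # illustrative input image
    result = generate_image(
        "Extract all text from this image.",
        page,
        max_new_tokens=512,
        temperature=0.1,
        top_p=0.9,
        top_k=50,
        repetition_penalty=1.05,
    )
    print(result)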