Commit 5e01004 · Parent(s): 2f34683

Initial commit for VLM support. Created visualisations for OCR output. Corrected log_file_output_paths reference.

Files changed:
- .dockerignore (+1 -0)
- .gitignore (+1 -0)
- Dockerfile (+8 -3)
- README.md (+2 -2)
- app.py (+25 -21)
- cli_redact.py (+1 -1)
- requirements.txt (+12 -4)
- requirements_lightweight.txt (+38 -0)
- src/app_settings.qmd (+5 -5)
- src/user_guide.qmd (+1 -1)
- tools/config.py (+73 -5)
- tools/custom_image_analyser_engine.py (+638 -124)
- tools/file_redaction.py (+446 -18)
- tools/run_vlm.py (+211 -0)
.dockerignore CHANGED

@@ -34,3 +34,4 @@ test/output/*
 test/tmp/*
 test/usage/*
 .ruff_cache/*
+model_cache/*
.gitignore CHANGED

@@ -37,3 +37,4 @@ test/output/*
 test/tmp/*
 test/usage/*
 .ruff_cache/*
+model_cache/*
Dockerfile CHANGED

@@ -16,11 +16,11 @@ RUN apt-get update \
 
 WORKDIR /src
 
-COPY …
+COPY requirements_lightweight.txt .
 
-RUN pip install --verbose --no-cache-dir --target=/install -r …
+RUN pip install --verbose --no-cache-dir --target=/install -r requirements_lightweight.txt && rm requirements_lightweight.txt
 
-# Optionally install PaddleOCR if the INSTALL_PADDLEOCR environment variable is set to True. See …
+# Optionally install PaddleOCR if the INSTALL_PADDLEOCR environment variable is set to True. See requirements_lightweight.txt for more details, including installing the GPU version of PaddleOCR.
 ARG INSTALL_PADDLEOCR=False
 ENV INSTALL_PADDLEOCR=${INSTALL_PADDLEOCR}
 
@@ -28,6 +28,11 @@ RUN if [ "$INSTALL_PADDLEOCR" = "True" ]; then \
     pip install --verbose --no-cache-dir --target=/install paddleocr==3.3.0 paddlepaddle==3.2.0; \
     fi
 
+RUN if [ "$INSTALL_VLM" = "True" ]; then \
+    pip install --verbose --no-cache-dir --target=/install torch==2.6.0 torchvision --index-url https://download.pytorch.org/whl/cu126; \
+    pip install --verbose --no-cache-dir --target=/install transformers==4.57.1 accelerate==1.11.0 bitsandbytes==0.48.1; \
+    fi
+
 # ===================================================================
 # Stage 2: A common 'base' for both Lambda and Gradio
 # ===================================================================
README.md CHANGED

@@ -162,7 +162,7 @@ These settings are useful for all users, regardless of whether you are using AWS
 * Set to `True` to display a language selection dropdown in the UI for OCR processing.
 
 * `CHOSEN_LOCAL_OCR_MODEL=tesseract`
-  * Choose the backend for local OCR. Options are `tesseract`, `paddle`, or `hybrid`. "tesseract" is the default, and is recommended. "hybrid" is a combination of the two - the first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence. "paddle" will only return whole-line text extraction, and so will only work for OCR, not redaction.
+  * Choose the backend for local OCR. Options are `tesseract`, `paddle`, or `hybrid-paddle`. "tesseract" is the default, and is recommended. "hybrid-paddle" is a combination of the two - the first pass through the redactions will be done with Tesseract, and then a second pass will be done with PaddleOCR on words with low confidence. "paddle" will only return whole-line text extraction, and so will only work for OCR, not redaction.
 
 * `SESSION_OUTPUT_FOLDER=False`
   * If `True`, redacted files will be saved in unique subfolders within the `output/` directory for each session.

@@ -922,7 +922,7 @@ The hybrid OCR mode uses several configurable parameters:
 
 - **HYBRID_OCR_CONFIDENCE_THRESHOLD** (default: 65): Tesseract confidence score below which PaddleOCR will be used for re-extraction
 - **HYBRID_OCR_PADDING** (default: 1): Padding added to word bounding boxes before re-extraction
-- **…
+- **SAVE_EXAMPLE_HYBRID_IMAGES** (default: False): Save comparison images when using hybrid mode
 - **SAVE_PADDLE_VISUALISATIONS** (default: False): Save images with PaddleOCR bounding boxes overlaid
 
 ### When to use different OCR models
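The re-extraction loop that these parameters drive can be sketched briefly. This is a minimal illustration rather than the project's implementation: it assumes PaddleOCR 3.x's `predict` API, and the function and variable names are illustrative.

import numpy as np
import pytesseract
from PIL import Image

HYBRID_OCR_CONFIDENCE_THRESHOLD = 65  # as documented above
HYBRID_OCR_PADDING = 1                # pixels of padding around each word box

def hybrid_pass(image: Image.Image, paddle_ocr) -> dict:
    """First pass with Tesseract; re-extract low-confidence words with PaddleOCR."""
    data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    for i, word in enumerate(data["text"]):
        conf = int(float(data["conf"][i]))
        if not word.strip() or conf >= HYBRID_OCR_CONFIDENCE_THRESHOLD:
            continue
        # Crop the padded word box, clamped to the image bounds
        left = max(0, data["left"][i] - HYBRID_OCR_PADDING)
        top = max(0, data["top"][i] - HYBRID_OCR_PADDING)
        right = min(image.width, data["left"][i] + data["width"][i] + HYBRID_OCR_PADDING)
        bottom = min(image.height, data["top"][i] + data["height"][i] + HYBRID_OCR_PADDING)
        crop = np.array(image.crop((left, top, right, bottom)))
        # Ask PaddleOCR for a second opinion on just this word
        results = paddle_ocr.predict(crop)
        if results and results[0]["rec_texts"]:
            data["text"][i] = " ".join(results[0]["rec_texts"])
            data["conf"][i] = int(100 * min(results[0]["rec_scores"]))
    return data

The word's original bounding box is kept either way; only the text and confidence are replaced, which is what keeps the result usable for redaction.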
app.py CHANGED

@@ -1242,16 +1242,7 @@ with blocks:
     label=f"Change default redaction settings.{default_text}{textract_text}{comprehend_text}{open_tab_text}".strip(),
     open=EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT,
 ):
-    text_extract_method_radio.render()
-
-    if SHOW_AWS_TEXT_EXTRACTION_OPTIONS:
-        with gr.Accordion(
-            "Enable AWS Textract signature detection (default is off)",
-            open=False,
-        ):
-            handwrite_signature_checkbox.render()
-    else:
-        handwrite_signature_checkbox.render()
+    text_extract_method_radio.render()
 
     if SHOW_LOCAL_OCR_MODEL_OPTIONS:
         with gr.Accordion(

@@ -1259,7 +1250,7 @@ with blocks:
             open=EXTRACTION_AND_PII_OPTIONS_OPEN_BY_DEFAULT,
         ):
             local_ocr_method_radio = gr.Radio(
-                label="""Choose local OCR model. "tesseract" is the default and will work for most documents. "paddle" is accurate for whole-line text extraction, but word-level extraction is not natively supported, and so word bounding boxes will be inaccurate. "hybrid" is a combination of the two - the first pass through the redactions will be done with Tesseract, and then a second pass will be done with the chosen hybrid model (default PaddleOCR) on words with low confidence.""",
+                label="""Choose local OCR model. "tesseract" is the default and will work for most documents. "paddle" is accurate for whole-line text extraction, but word-level extraction is not natively supported, and so word bounding boxes will be inaccurate. "hybrid-paddle" runs a first pass with Tesseract, then a second pass with PaddleOCR on words with low confidence. "hybrid-vlm" runs a first pass with Tesseract, then a second pass with the chosen vision model (default Dots.OCR) on words with low confidence.""",
                 value=CHOSEN_LOCAL_OCR_MODEL,
                 choices=LOCAL_OCR_MODEL_OPTIONS,
                 interactive=True,

@@ -1274,6 +1265,15 @@ with blocks:
         visible=False,
     )
 
+    if SHOW_AWS_TEXT_EXTRACTION_OPTIONS:
+        with gr.Accordion(
+            "Enable AWS Textract signature detection (default is off)",
+            open=False,
+        ):
+            handwrite_signature_checkbox.render()
+    else:
+        handwrite_signature_checkbox.render()
+
     with gr.Row(equal_height=True):
         pii_identification_method_drop.render()

@@ -1378,16 +1378,20 @@ with blocks:
     with gr.Row(equal_height=False):
         with gr.Column(scale=2):
             textract_job_detail_df = gr.Dataframe(
-                …
+                pd.DataFrame(
+                    columns=[
+                        "job_id",
+                        "file_name",
+                        "job_type",
+                        "signature_extraction",
+                        "job_date_time",
+                    ]
+                ),
+                label="Previous job details",
+                visible=True,
+                type="pandas",
+                wrap=True,
+            )
         with gr.Column(scale=1):
             job_id_textbox = gr.Textbox(
                 label="Job ID to check status",
cli_redact.py CHANGED

@@ -399,7 +399,7 @@ python cli_redact.py --task textract --textract_action list
 )
 pdf_group.add_argument(
     "--chosen_local_ocr_model",
-    choices=["tesseract", "hybrid", "paddle"],
+    choices=["tesseract", "hybrid-paddle", "paddle"],
     default=CHOSEN_LOCAL_OCR_MODEL,
     help="Local OCR model to use.",
 )
requirements.txt CHANGED

@@ -24,13 +24,21 @@ python-dotenv==1.0.1
 awslambdaric==3.1.1
 python-docx==1.2.0
 defusedxml==0.7.1
-# Optional: uncomment the below to install PaddleOCR if you want to use hybrid text extraction (tesseract plus paddleocr)
-# paddlepaddle==3.2.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
-# paddleocr==3.3.0
-
 # Test dependencies
 pytest>=7.0.0
 pytest-cov>=4.0.0
+spaces==0.42.1
+# Optional: uncomment the below to install PaddleOCR if you want to use hybrid text extraction (tesseract plus paddleocr)
+# paddlepaddle==3.2.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
+# paddleocr==3.3.0
+# For running VLMs
+--extra-index-url https://download.pytorch.org/whl/cu126
+torch==2.6.0
+torchvision
+transformers==4.57.1
+accelerate==1.11.0
+bitsandbytes==0.48.1
+flash-attn==2.8.3 # Only compatible with Linux systems
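A quick, illustrative way to confirm the optional VLM stack above installed correctly into the active environment:

import torch
import transformers
import accelerate
import bitsandbytes

print("torch", torch.__version__, "| CUDA available:", torch.cuda.is_available())
print("transformers", transformers.__version__)
print("accelerate", accelerate.__version__)
print("bitsandbytes", bitsandbytes.__version__)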
requirements_lightweight.txt ADDED

@@ -0,0 +1,38 @@
+pdfminer.six==20250506
+pdf2image==1.17.0
+pymupdf==1.26.4
+opencv-python==4.12.0.88
+presidio_analyzer==2.2.360
+presidio_anonymizer==2.2.360
+presidio-image-redactor==0.0.57
+pikepdf==9.11.0
+pandas==2.3.3
+scikit-learn==1.7.2
+spacy==3.8.7
+en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0.tar.gz
+gradio==5.49.1
+polars==1.33.1
+boto3==1.40.57
+pyarrow==21.0.0
+openpyxl==3.1.5
+Faker==37.8.0
+python-levenshtein==0.27.1
+spaczz==0.6.1
+https://github.com/seanpedrick-case/gradio_image_annotator/releases/download/v0.3.3/gradio_image_annotation-0.3.3-py3-none-any.whl # This version includes rotation, image zoom, and default labels, as well as the option to include id for annotation boxes
+rapidfuzz==3.14.1
+python-dotenv==1.0.1
+awslambdaric==3.1.1
+python-docx==1.2.0
+defusedxml==0.7.1
+# Test dependencies
+pytest>=7.0.0
+pytest-cov>=4.0.0
+spaces==0.42.1
+# Optional: uncomment the below to install PaddleOCR if you want to use hybrid text extraction (tesseract plus paddleocr)
+# paddlepaddle==3.2.0 # Consider installing the GPU version for faster local OCR inference with PaddleOCR: paddlepaddle-gpu==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/ , compatible with CUDA 12.6. See this for more details: https://www.paddlepaddle.org.cn/documentation/docs/en/install/pip/linux-pip_en.html#span-id-gpu-gpu-version-of-paddlepaddle-span
+# paddleocr==3.3.0
src/app_settings.qmd CHANGED

@@ -300,7 +300,7 @@ Configurations related to text extraction, PII detection, and the redaction process.
 ### Local OCR (Tesseract & PaddleOCR)
 
 * **`CHOSEN_LOCAL_OCR_MODEL`**
-  * **Description:** Choose the engine for local OCR: `"tesseract"`, `"paddle"`, or `"hybrid"`.
+  * **Description:** Choose the engine for local OCR: `"tesseract"`, `"paddle"`, or `"hybrid-paddle"`.
   * **Default Value:** `"tesseract"`
 
 * **`SHOW_LOCAL_OCR_MODEL_OPTIONS`**

@@ -308,11 +308,11 @@ Configurations related to text extraction, PII detection, and the redaction process.
   * **Default Value:** `"False"`
 
 * **`HYBRID_OCR_CONFIDENCE_THRESHOLD`**
-  * **Description:** In "hybrid" mode, this is the Tesseract confidence score below which PaddleOCR will be used for re-extraction.
+  * **Description:** In "hybrid-paddle" mode, this is the Tesseract confidence score below which PaddleOCR will be used for re-extraction.
   * **Default Value:** `65`
 
 * **`HYBRID_OCR_PADDING`**
-  * **Description:** In "hybrid" mode, padding added to the word's bounding box before re-extraction.
+  * **Description:** In "hybrid-paddle" mode, padding added to the word's bounding box before re-extraction.
   * **Default Value:** `1`
 
 * **`PADDLE_USE_TEXTLINE_ORIENTATION`**

@@ -323,8 +323,8 @@ Configurations related to text extraction, PII detection, and the redaction process.
   * **Description:** Controls the expansion ratio of the detected text region in PaddleOCR.
   * **Default Value:** `1.2`
 
-* **`…`**
-  * **Description:** Saves comparison images when using "hybrid" OCR mode.
+* **`SAVE_EXAMPLE_HYBRID_IMAGES`**
+  * **Description:** Saves comparison images when using "hybrid-paddle" OCR mode.
   * **Default Value:** `"False"`
 
 * **`SAVE_PADDLE_VISUALISATIONS`**
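Since these settings are plain environment variables, they can be set before the app starts. An illustrative snippet (the variable names come from the settings above; values are read once at startup by tools/config.py, so they must be set before it is imported):

import os

os.environ["CHOSEN_LOCAL_OCR_MODEL"] = "hybrid-paddle"
os.environ["HYBRID_OCR_CONFIDENCE_THRESHOLD"] = "60"  # re-extract below 60 instead of the default 65
os.environ["HYBRID_OCR_PADDING"] = "2"
os.environ["SAVE_EXAMPLE_HYBRID_IMAGES"] = "True"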
src/user_guide.qmd CHANGED

@@ -721,7 +721,7 @@ The hybrid OCR mode uses several configurable parameters:
 
 - **HYBRID_OCR_CONFIDENCE_THRESHOLD** (default: 65): Tesseract confidence score below which PaddleOCR will be used for re-extraction
 - **HYBRID_OCR_PADDING** (default: 1): Padding added to word bounding boxes before re-extraction
-- **…
+- **SAVE_EXAMPLE_HYBRID_IMAGES** (default: False): Save comparison images when using hybrid mode
 - **SAVE_PADDLE_VISUALISATIONS** (default: False): Save images with PaddleOCR bounding boxes overlaid
 
 ### When to use different OCR models
tools/config.py CHANGED

@@ -437,10 +437,54 @@ DEFAULT_TABULAR_ANONYMISATION_STRATEGY = get_or_create_env_var(
     "DEFAULT_TABULAR_ANONYMISATION_STRATEGY", "redact completely"
 )
 
+###
+# LOCAL OCR MODEL OPTIONS
+###
+
+### VLM OPTIONS
+
+SHOW_VLM_MODEL_OPTIONS = convert_string_to_boolean(
+    get_or_create_env_var("SHOW_VLM_MODEL_OPTIONS", "False")
+)  # Whether to show the VLM model options in the UI
+
+SELECTED_MODEL = get_or_create_env_var(
+    "SELECTED_MODEL", "Dots.OCR"
+)  # Selected vision model. Choose from: "olmOCR-2-7B-1025", "Nanonets-OCR2-3B", "Chandra-OCR", "Dots.OCR"
+
+if SHOW_VLM_MODEL_OPTIONS:
+    VLM_MODEL_OPTIONS = [
+        SELECTED_MODEL,
+    ]
+
+MAX_SPACES_GPU_RUN_TIME = int(
+    get_or_create_env_var("MAX_SPACES_GPU_RUN_TIME", "60")
+)  # Maximum number of seconds to run the GPU on Spaces
+
+MAX_NEW_TOKENS = int(
+    get_or_create_env_var("MAX_NEW_TOKENS", "30")
+)  # Maximum number of tokens to generate
+
+DEFAULT_MAX_NEW_TOKENS = int(
+    get_or_create_env_var("DEFAULT_MAX_NEW_TOKENS", "30")
+)  # Default maximum number of tokens to generate
+
+MAX_INPUT_TOKEN_LENGTH = int(
+    get_or_create_env_var("MAX_INPUT_TOKEN_LENGTH", "4096")
+)  # Maximum number of tokens to input to the VLM
+
+USE_FLASH_ATTENTION = convert_string_to_boolean(
+    get_or_create_env_var("USE_FLASH_ATTENTION", "False")
+)  # Whether to use flash attention for the VLM
+
+OVERWRITE_EXISTING_OCR_RESULTS = convert_string_to_boolean(
+    get_or_create_env_var("OVERWRITE_EXISTING_OCR_RESULTS", "False")
+)  # If True, always create new OCR results instead of loading from existing JSON files
+
 ### Local OCR model - Tesseract vs PaddleOCR
 CHOSEN_LOCAL_OCR_MODEL = get_or_create_env_var(
     "CHOSEN_LOCAL_OCR_MODEL", "tesseract"
-)  # Choose between "tesseract", "hybrid", and "paddle". "paddle" is accurate for whole-line text extraction, but word-level extraction is not natively supported, and so word bounding boxes will be inaccurate. "hybrid" runs a first pass with Tesseract, then a second pass with the chosen hybrid model (default PaddleOCR) on words with low confidence.
+)  # Choose between "tesseract", "hybrid-paddle", and "paddle". "paddle" is accurate for whole-line text extraction, but word-level extraction is not natively supported, and so word bounding boxes will be inaccurate. "hybrid-paddle" runs a first pass with Tesseract, then a second pass with the chosen hybrid model (default PaddleOCR) on words with low confidence.
 
 SHOW_LOCAL_OCR_MODEL_OPTIONS = convert_string_to_boolean(
     get_or_create_env_var("SHOW_LOCAL_OCR_MODEL_OPTIONS", "False")

@@ -448,12 +492,19 @@ SHOW_LOCAL_OCR_MODEL_OPTIONS = convert_string_to_boolean(
 if SHOW_LOCAL_OCR_MODEL_OPTIONS:
     LOCAL_OCR_MODEL_OPTIONS = [
         "tesseract",
-        "hybrid",
+        "hybrid-paddle",
         "paddle",
     ]
 else:
     LOCAL_OCR_MODEL_OPTIONS = ["tesseract"]
 
+vlm_options = ["hybrid-vlm"]
+if SHOW_VLM_MODEL_OPTIONS:
+    LOCAL_OCR_MODEL_OPTIONS.extend(vlm_options)
+
+MODEL_CACHE_PATH = get_or_create_env_var("MODEL_CACHE_PATH", "./model_cache")
+
 HYBRID_OCR_CONFIDENCE_THRESHOLD = int(
     get_or_create_env_var("HYBRID_OCR_CONFIDENCE_THRESHOLD", "65")
 )  # The Tesseract confidence threshold under which the text will be passed to PaddleOCR for re-extraction using the hybrid OCR method.

@@ -461,6 +512,14 @@ HYBRID_OCR_PADDING = int(
     get_or_create_env_var("HYBRID_OCR_PADDING", "1")
 )  # The padding to add to the text when passing it to PaddleOCR for re-extraction using the hybrid OCR method.
 
+TESSERACT_SEGMENTATION_LEVEL = get_or_create_env_var(
+    "TESSERACT_SEGMENTATION_LEVEL", "word"
+)  # Tesseract segmentation level: "word" (PSM 11) or "line" (PSM 6)
+
+CONVERT_LINE_TO_WORD_LEVEL = convert_string_to_boolean(
+    get_or_create_env_var("CONVERT_LINE_TO_WORD_LEVEL", "False")
+)  # Whether to convert line-level OCR results to word-level for better precision
+
 PADDLE_USE_TEXTLINE_ORIENTATION = convert_string_to_boolean(
     get_or_create_env_var("PADDLE_USE_TEXTLINE_ORIENTATION", "False")
 )

@@ -469,14 +528,22 @@ PADDLE_DET_DB_UNCLIP_RATIO = float(
     get_or_create_env_var("PADDLE_DET_DB_UNCLIP_RATIO", "1.2")
 )
 
-…
-    get_or_create_env_var("…
+SAVE_EXAMPLE_HYBRID_IMAGES = convert_string_to_boolean(
+    get_or_create_env_var("SAVE_EXAMPLE_HYBRID_IMAGES", "False")
 )  # Whether to save example images of Tesseract vs PaddleOCR re-extraction in hybrid OCR mode.
 
 SAVE_PADDLE_VISUALISATIONS = convert_string_to_boolean(
     get_or_create_env_var("SAVE_PADDLE_VISUALISATIONS", "False")
 )  # Whether to save visualisations of PaddleOCR bounding boxes.
 
+SAVE_TESSERACT_VISUALISATIONS = convert_string_to_boolean(
+    get_or_create_env_var("SAVE_TESSERACT_VISUALISATIONS", "False")
+)  # Whether to save visualisations of Tesseract bounding boxes.
+
+SAVE_TEXTRACT_VISUALISATIONS = convert_string_to_boolean(
+    get_or_create_env_var("SAVE_TEXTRACT_VISUALISATIONS", "False")
+)  # Whether to save visualisations of AWS Textract bounding boxes.
+
 # Model storage paths for Lambda compatibility
 PADDLE_MODEL_PATH = get_or_create_env_var(
     "PADDLE_MODEL_PATH", ""

@@ -487,7 +554,7 @@ SPACY_MODEL_PATH = get_or_create_env_var(
 )  # Directory for spaCy model storage. Uses default location if not set.
 
 PREPROCESS_LOCAL_OCR_IMAGES = get_or_create_env_var(
-    "PREPROCESS_LOCAL_OCR_IMAGES", "…"
+    "PREPROCESS_LOCAL_OCR_IMAGES", "True"
 )  # Whether to try and preprocess images before extracting text. NOTE: I have found in testing that this doesn't necessarily improve results, and greatly slows down extraction.
 
 # Entities for redaction

@@ -1012,6 +1079,7 @@ DAYS_TO_DISPLAY_WHOLE_DOCUMENT_JOBS = int(
 )  # How many days into the past should whole document Textract jobs be displayed? After that, the data is not deleted from the Textract jobs csv, but it is just filtered out. Included to align with S3 buckets where the file outputs will be automatically deleted after X days.
 
 
+
 ###
 # Config vars output format
 ###
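The block above leans on two helpers whose definitions are not part of this diff. The sketch below shows plausible behaviour only, inferred from how they are called; the real implementations live elsewhere in tools/config.py.

import os

def get_or_create_env_var(var_name: str, default_value: str) -> str:
    # Return the environment variable if set, otherwise fall back to (and record) the default
    value = os.environ.get(var_name)
    if value is None:
        os.environ[var_name] = default_value
        value = default_value
    return value

def convert_string_to_boolean(value: str) -> bool:
    # Interpret "True"-style strings as booleans
    return value.strip().lower() in ("true", "1", "yes")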
tools/custom_image_analyser_engine.py CHANGED

@@ -17,22 +17,28 @@ from presidio_analyzer import AnalyzerEngine, RecognizerResult
 
 from tools.config import (
     AWS_PII_OPTION,
+    CONVERT_LINE_TO_WORD_LEVEL,
     DEFAULT_LANGUAGE,
     HYBRID_OCR_CONFIDENCE_THRESHOLD,
     HYBRID_OCR_PADDING,
     LOCAL_OCR_MODEL_OPTIONS,
     LOCAL_PII_OPTION,
+    MAX_NEW_TOKENS,
     OUTPUT_FOLDER,
     PADDLE_DET_DB_UNCLIP_RATIO,
     PADDLE_MODEL_PATH,
     PADDLE_USE_TEXTLINE_ORIENTATION,
     PREPROCESS_LOCAL_OCR_IMAGES,
-    …
+    SAVE_EXAMPLE_HYBRID_IMAGES,
     SAVE_PADDLE_VISUALISATIONS,
+    SAVE_TESSERACT_VISUALISATIONS,
+    SELECTED_MODEL,
+    TESSERACT_SEGMENTATION_LEVEL,
 )
 from tools.helper_functions import clean_unicode_text
 from tools.load_spacy_model_custom_recognisers import custom_entities
 from tools.presidio_analyzer_custom import recognizer_result_from_dict
+from tools.run_vlm import generate_image as vlm_generate_image
 from tools.secure_path_utils import validate_folder_containment
 from tools.secure_regex_utils import safe_sanitize_text

@@ -177,6 +183,7 @@ class OCRResult:
     height: int
     conf: float = None
     line: int = None
+    model: str = None  # Track which OCR model was used (e.g., "Tesseract", "Paddle", "VLM")
 
 
 @dataclass

@@ -368,30 +375,88 @@ class ContrastSegmentedImageEnhancer(ImagePreprocessor):
         adjusted_contrast = contrast
         return adjusted_image, contrast, adjusted_contrast
 
+    def _deskew(self, image_np: np.ndarray) -> np.ndarray:
+        """
+        Corrects the skew of an image.
+        This method works best on a grayscaled image.
+        """
+        # Work on a copy for angle detection
+        gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY) if len(image_np.shape) == 3 else image_np.copy()
+
+        # Invert the image for contour finding
+        thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
+
+        coords = np.column_stack(np.where(thresh > 0))
+        angle = cv2.minAreaRect(coords)[-1]
+
+        # Adjust the angle for rotation
+        if angle < -45:
+            angle = -(90 + angle)
+        else:
+            angle = -angle
+
+        # Don't rotate if the angle is negligible
+        if abs(angle) < 0.1:
+            return image_np
+
+        (h, w) = image_np.shape[:2]
+        center = (w // 2, h // 2)
+        M = cv2.getRotationMatrix2D(center, angle, 1.0)
+
+        # Use the original numpy image for the rotation to preserve quality
+        rotated = cv2.warpAffine(
+            image_np, M, (w, h),
+            flags=cv2.INTER_CUBIC,
+            borderMode=cv2.BORDER_REPLICATE
+        )
+
+        return rotated
+
     def preprocess_image(
-        self,
-        …
+        self,
+        image: Image.Image,
+        perform_deskew: bool = False,
+        perform_binarization: bool = False,
     ) -> Tuple[Image.Image, dict]:
         """
-        A …
-        Order: Greyscale -> Rescale -> Denoise -> Enhance Contrast -> Binarize
-
-        I have found that binarization is not always helpful with Tesseract, and can sometimes degrade results. So it is off by default.
+        A pipeline for OCR preprocessing.
+        Order: Deskew -> Greyscale -> Rescale -> Denoise -> Enhance Contrast -> Binarize
         """
-        # 1. Convert to …
-        …
+        # 1. Convert PIL image to NumPy array for OpenCV processing (RGB -> BGR)
+        image_np = np.array(image.convert("RGB"))
+        image_np_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
+
+        # 2. Deskew the image, best done early on the full-quality image
+        if perform_deskew:
+            deskewed_image_np = self._deskew(image_np_bgr)
+        else:
+            deskewed_image_np = image_np_bgr
+
+        # 3. Convert to greyscale
+        gray_image_np = cv2.cvtColor(deskewed_image_np, cv2.COLOR_BGR2GRAY)
+
+        # 4. Rescale image to optimal DPI
         rescaled_image_np, scale_metadata = self.image_rescaling.preprocess_image(
-            …
+            gray_image_np
         )
 
-        # …
+        # 5. Apply filtering for noise reduction
         filtered_image_np, _ = self.bilateral_filter.preprocess_image(rescaled_image_np)
 
-        # …
+        # 6. Improve contrast
         adjusted_image_np, _, _ = self._improve_contrast(filtered_image_np)
 
-        # …
+        # 7. Adaptive thresholding (binarization) - final optional step
         if perform_binarization:
             final_image_np, threshold_metadata = (
                 self.adaptive_threshold.preprocess_image(adjusted_image_np)

@@ -404,7 +469,8 @@ class ContrastSegmentedImageEnhancer(ImagePreprocessor):
         final_metadata = {**scale_metadata, **threshold_metadata}
 
         # Convert final numpy array back to PIL Image for return
-        …
+        # The final image is greyscale, so it's safe to use 'L' mode
+        return Image.fromarray(final_image_np).convert('L'), final_metadata
 
 
 def rescale_ocr_data(ocr_data, scale_factor: float):

@@ -447,10 +513,6 @@ def filter_entities_for_language(
     print(f"No entities provided for language: {language}")
     # raise Warning(f"No entities provided for language: {language}")
 
-    # print("entities:", entities)
-    # print("valid_language_entities:", valid_language_entities)
-    # print("language:", language)
-
     filtered_entities = [
         entity for entity in entities if entity in valid_language_entities
     ]

@@ -467,6 +529,75 @@ def filter_entities_for_language(
     return filtered_entities
 
 
+def _get_tesseract_psm(segmentation_level: str) -> int:
+    """
+    Get the appropriate Tesseract PSM (Page Segmentation Mode) value based on segmentation level.
+
+    Args:
+        segmentation_level: "word" or "line"
+
+    Returns:
+        PSM value for Tesseract configuration
+    """
+    if segmentation_level.lower() == "line":
+        return 6  # Uniform block of text
+    elif segmentation_level.lower() == "word":
+        return 11  # Sparse text (word-level)
+    else:
+        print(
+            f"Warning: Unknown segmentation level '{segmentation_level}', defaulting to word-level (PSM 11)"
+        )
+        return 11
+
+
+def _vlm_ocr_predict(
+    image: Image.Image,
+    prompt: str = "Extract all text from this image. Return only the text, no other information.",
+) -> Dict[str, Any]:
+    """
+    VLM OCR prediction function that mimics PaddleOCR's interface.
+
+    Args:
+        image: PIL Image to process
+        prompt: Text prompt for the VLM
+
+    Returns:
+        Dictionary in PaddleOCR format with 'rec_texts' and 'rec_scores'
+    """
+    try:
+        # Use the VLM to extract text
+        extracted_text = vlm_generate_image(
+            text=prompt,
+            image=image,
+            max_new_tokens=MAX_NEW_TOKENS,
+            temperature=0.7,
+            top_p=0.9,
+            top_k=50,
+            repetition_penalty=1.3,
+        )
+
+        if extracted_text and extracted_text.strip():
+            # Clean the text
+            cleaned_text = extracted_text.strip()
+
+            # Split into words for compatibility with PaddleOCR format
+            words = cleaned_text.split()
+
+            # Create a PaddleOCR-compatible result
+            result = {
+                "rec_texts": words,
+                "rec_scores": [0.95] * len(words),  # High confidence for VLM results
+            }
+
+            return result
+        else:
+            return {"rec_texts": [], "rec_scores": []}
+
+    except Exception as e:
+        print(f"VLM OCR error: {e}")
+        return {"rec_texts": [], "rec_scores": []}
+
+
 class CustomImageAnalyzerEngine:
     def __init__(
         self,

@@ -481,9 +612,9 @@ class CustomImageAnalyzerEngine:
         """
         Initializes the CustomImageAnalyzerEngine.
 
-        :param ocr_engine: The OCR engine to use ("tesseract", "hybrid", or "paddle").
+        :param ocr_engine: The OCR engine to use ("tesseract", "hybrid-paddle", "hybrid-vlm", or "paddle").
         :param analyzer_engine: The Presidio AnalyzerEngine instance.
-        :param tesseract_config: Configuration string for Tesseract.
+        :param tesseract_config: Configuration string for Tesseract. If None, uses TESSERACT_SEGMENTATION_LEVEL config.
         :param paddle_kwargs: Dictionary of keyword arguments for PaddleOCR constructor.
         :param image_preprocessor: Optional image preprocessor.
         :param language: Preferred OCR language (e.g., "en", "fr", "de"). Defaults to DEFAULT_LANGUAGE.

@@ -511,7 +642,7 @@ class CustomImageAnalyzerEngine:
         )
         self.output_folder = normalized_output_folder
 
-        if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid":
+        if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid-paddle":
             if PaddleOCR is None:
                 raise ImportError(
                     "paddleocr is not installed. Please run 'pip install paddleocr paddlepaddle' in your python environment and retry."

@@ -538,22 +669,39 @@ class CustomImageAnalyzerEngine:
             paddle_kwargs.setdefault("lang", self.paddle_lang)
             self.paddle_ocr = PaddleOCR(**paddle_kwargs)
 
+        elif self.ocr_engine == "hybrid-vlm":
+            # VLM-based hybrid OCR - no additional initialization needed
+            # The VLM model is loaded when run_vlm.py is imported
+            print(f"Initializing hybrid VLM OCR with model: {SELECTED_MODEL}")
+            self.paddle_ocr = None  # Not using PaddleOCR
+
         if not analyzer_engine:
             analyzer_engine = AnalyzerEngine()
         self.analyzer_engine = analyzer_engine
 
-        …
+        # Set Tesseract configuration based on segmentation level
+        if tesseract_config:
+            self.tesseract_config = tesseract_config
+        else:
+            psm_value = _get_tesseract_psm(TESSERACT_SEGMENTATION_LEVEL)
+            self.tesseract_config = f"--oem 3 --psm {psm_value}"
 
         if not image_preprocessor:
             image_preprocessor = ContrastSegmentedImageEnhancer()
         self.image_preprocessor = image_preprocessor
 
-    def _sanitize_filename(…
+    def _sanitize_filename(
+        self, text: str, max_length: int = 20, fallback_prefix: str = "unknown_text"
+    ) -> str:
         """
         Sanitizes text for use in filenames by removing invalid characters and limiting length.
 
         :param text: The text to sanitize
         :param max_length: Maximum length of the sanitized text
+        :param fallback_prefix: Prefix to use if sanitization fails
         :return: Sanitized text safe for filenames
         """
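As an aside on the configuration set above: the segmentation-level mapping feeds straight into pytesseract's config string. A standalone illustration (the input file name is hypothetical):

import pytesseract
from PIL import Image

image = Image.open("page.png")  # hypothetical input
config = "--oem 3 --psm 11"  # what _get_tesseract_psm("word") produces; "line" gives PSM 6
data = pytesseract.image_to_data(image, config=config, output_type=pytesseract.Output.DICT)
print(len(data["text"]), "tokens extracted")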
| 17 |
|
| 18 |
from tools.config import (
|
| 19 |
AWS_PII_OPTION,
|
| 20 |
+
CONVERT_LINE_TO_WORD_LEVEL,
|
| 21 |
DEFAULT_LANGUAGE,
|
| 22 |
HYBRID_OCR_CONFIDENCE_THRESHOLD,
|
| 23 |
HYBRID_OCR_PADDING,
|
| 24 |
LOCAL_OCR_MODEL_OPTIONS,
|
| 25 |
LOCAL_PII_OPTION,
|
| 26 |
+
MAX_NEW_TOKENS,
|
| 27 |
OUTPUT_FOLDER,
|
| 28 |
PADDLE_DET_DB_UNCLIP_RATIO,
|
| 29 |
PADDLE_MODEL_PATH,
|
| 30 |
PADDLE_USE_TEXTLINE_ORIENTATION,
|
| 31 |
PREPROCESS_LOCAL_OCR_IMAGES,
|
| 32 |
+
SAVE_EXAMPLE_HYBRID_IMAGES,
|
| 33 |
SAVE_PADDLE_VISUALISATIONS,
|
| 34 |
+
SAVE_TESSERACT_VISUALISATIONS,
|
| 35 |
+
SELECTED_MODEL,
|
| 36 |
+
TESSERACT_SEGMENTATION_LEVEL,
|
| 37 |
)
|
| 38 |
from tools.helper_functions import clean_unicode_text
|
| 39 |
from tools.load_spacy_model_custom_recognisers import custom_entities
|
| 40 |
from tools.presidio_analyzer_custom import recognizer_result_from_dict
|
| 41 |
+
from tools.run_vlm import generate_image as vlm_generate_image
|
| 42 |
from tools.secure_path_utils import validate_folder_containment
|
| 43 |
from tools.secure_regex_utils import safe_sanitize_text
|
| 44 |
|
|
|
|
| 183 |
height: int
|
| 184 |
conf: float = None
|
| 185 |
line: int = None
|
| 186 |
+
model: str = None # Track which OCR model was used (e.g., "Tesseract", "Paddle", "VLM")
|
| 187 |
|
| 188 |
|
| 189 |
@dataclass
|
|
|
|
| 375 |
adjusted_contrast = contrast
|
| 376 |
return adjusted_image, contrast, adjusted_contrast
|
| 377 |
|
| 378 |
+
def _deskew(self, image_np: np.ndarray) -> np.ndarray:
|
| 379 |
+
"""
|
| 380 |
+
Corrects the skew of an image.
|
| 381 |
+
This method works best on a grayscaled image.
|
| 382 |
+
"""
|
| 383 |
+
# We'll work with a copy for angle detection
|
| 384 |
+
gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY) if len(image_np.shape) == 3 else image_np.copy()
|
| 385 |
+
|
| 386 |
+
# Invert the image for contour finding
|
| 387 |
+
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
|
| 388 |
+
|
| 389 |
+
coords = np.column_stack(np.where(thresh > 0))
|
| 390 |
+
angle = cv2.minAreaRect(coords)[-1]
|
| 391 |
+
|
| 392 |
+
# Adjust the angle for rotation
|
| 393 |
+
if angle < -45:
|
| 394 |
+
angle = -(90 + angle)
|
| 395 |
+
else:
|
| 396 |
+
angle = -angle
|
| 397 |
+
|
| 398 |
+
# Don't rotate if the angle is negligible
|
| 399 |
+
if abs(angle) < 0.1:
|
| 400 |
+
return image_np
|
| 401 |
+
|
| 402 |
+
(h, w) = image_np.shape[:2]
|
| 403 |
+
center = (w // 2, h // 2)
|
| 404 |
+
M = cv2.getRotationMatrix2D(center, angle, 1.0)
|
| 405 |
+
|
| 406 |
+
# Use the original numpy image for the rotation to preserve quality
|
| 407 |
+
rotated = cv2.warpAffine(
|
| 408 |
+
image_np, M, (w, h),
|
| 409 |
+
flags=cv2.INTER_CUBIC,
|
| 410 |
+
borderMode=cv2.BORDER_REPLICATE
|
| 411 |
+
)
|
| 412 |
+
|
| 413 |
+
return rotated
|
| 414 |
+
|
| 415 |
def preprocess_image(
|
| 416 |
+
self,
|
| 417 |
+
image: Image.Image,
|
| 418 |
+
perform_deskew: bool = False,
|
| 419 |
+
perform_binarization: bool = False,
|
| 420 |
) -> Tuple[Image.Image, dict]:
|
| 421 |
"""
|
| 422 |
+
A pipeline for OCR preprocessing.
|
| 423 |
+
Order: Deskew -> Greyscale -> Rescale -> Denoise -> Enhance Contrast -> Binarize
|
|
|
|
|
|
|
| 424 |
"""
|
| 425 |
+
# 1. Convert PIL image to NumPy array for OpenCV processing
|
| 426 |
+
# Assuming the original image is RGB
|
| 427 |
+
image_np = np.array(image.convert("RGB"))
|
| 428 |
+
# OpenCV uses BGR, so we convert RGB to BGR
|
| 429 |
+
image_np_bgr = cv2.cvtColor(image_np, cv2.COLOR_RGB2BGR)
|
| 430 |
+
|
| 431 |
+
# --- REVISED PIPELINE ---
|
| 432 |
+
|
| 433 |
+
# 2. Deskew the image (critical new step)
|
| 434 |
+
# This is best done early on the full-quality image.
|
| 435 |
+
if perform_deskew:
|
| 436 |
+
deskewed_image_np = self._deskew(image_np_bgr)
|
| 437 |
+
else:
|
| 438 |
+
deskewed_image_np = image_np_bgr
|
| 439 |
+
|
| 440 |
+
# 3. Convert to greyscale
|
| 441 |
+
# Your convert_image_to_array probably does this, but for clarity:
|
| 442 |
+
gray_image_np = cv2.cvtColor(deskewed_image_np, cv2.COLOR_BGR2GRAY)
|
| 443 |
+
|
| 444 |
+
# 4. Rescale image to optimal DPI
|
| 445 |
+
# Assuming your image_rescaling object can handle a greyscale numpy array
|
| 446 |
rescaled_image_np, scale_metadata = self.image_rescaling.preprocess_image(
|
| 447 |
+
gray_image_np
|
| 448 |
)
|
| 449 |
|
| 450 |
+
# 5. Apply filtering for noise reduction
|
| 451 |
+
# Suggestion: A Median filter is often very effective for scanned docs
|
| 452 |
+
# filtered_image_np = cv2.medianBlur(rescaled_image_np, 3)
|
| 453 |
+
# Or using your existing bilateral filter:
|
| 454 |
filtered_image_np, _ = self.bilateral_filter.preprocess_image(rescaled_image_np)
|
| 455 |
|
| 456 |
+
# 6. Improve contrast
|
| 457 |
adjusted_image_np, _, _ = self._improve_contrast(filtered_image_np)
|
| 458 |
|
| 459 |
+
# 7. Adaptive Thresholding (Binarization) - Final optional step
|
| 460 |
if perform_binarization:
|
| 461 |
final_image_np, threshold_metadata = (
|
| 462 |
self.adaptive_threshold.preprocess_image(adjusted_image_np)
|
|
|
|
| 469 |
final_metadata = {**scale_metadata, **threshold_metadata}
|
| 470 |
|
| 471 |
# Convert final numpy array back to PIL Image for return
|
| 472 |
+
# The final image is greyscale, so it's safe to use 'L' mode
|
| 473 |
+
return Image.fromarray(final_image_np).convert('L'), final_metadata
|
| 474 |
|
| 475 |
|
| 476 |
def rescale_ocr_data(ocr_data, scale_factor: float):
|
|
|
|
| 513 |
print(f"No entities provided for language: {language}")
|
| 514 |
# raise Warning(f"No entities provided for language: {language}")
|
| 515 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 516 |
filtered_entities = [
|
| 517 |
entity for entity in entities if entity in valid_language_entities
|
| 518 |
]
|
|
|
|
| 529 |
return filtered_entities
|
| 530 |
|
| 531 |
|
| 532 |
+
def _get_tesseract_psm(segmentation_level: str) -> int:
|
| 533 |
+
"""
|
| 534 |
+
Get the appropriate Tesseract PSM (Page Segmentation Mode) value based on segmentation level.
|
| 535 |
+
|
| 536 |
+
Args:
|
| 537 |
+
segmentation_level: "word" or "line"
|
| 538 |
+
|
| 539 |
+
Returns:
|
| 540 |
+
PSM value for Tesseract configuration
|
| 541 |
+
"""
|
| 542 |
+
if segmentation_level.lower() == "line":
|
| 543 |
+
return 6 # Uniform block of text
|
| 544 |
+
elif segmentation_level.lower() == "word":
|
| 545 |
+
return 11 # Sparse text (word-level)
|
| 546 |
+
else:
|
| 547 |
+
print(
|
| 548 |
+
f"Warning: Unknown segmentation level '{segmentation_level}', defaulting to word-level (PSM 11)"
|
| 549 |
+
)
|
| 550 |
+
return 11
|
| 551 |
+
|
| 552 |
+
|
| 553 |
+
def _vlm_ocr_predict(
|
| 554 |
+
image: Image.Image,
|
| 555 |
+
prompt: str = "Extract all text from this image. Return only the text, no other information.",
|
| 556 |
+
) -> Dict[str, Any]:
|
| 557 |
+
"""
|
| 558 |
+
VLM OCR prediction function that mimics PaddleOCR's interface.
|
| 559 |
+
|
| 560 |
+
Args:
|
| 561 |
+
image: PIL Image to process
|
| 562 |
+
prompt: Text prompt for the VLM
|
| 563 |
+
|
| 564 |
+
Returns:
|
| 565 |
+
Dictionary in PaddleOCR format with 'rec_texts' and 'rec_scores'
|
| 566 |
+
"""
|
| 567 |
+
try:
|
| 568 |
+
# Use the VLM to extract text
|
| 569 |
+
extracted_text = vlm_generate_image(
|
| 570 |
+
text=prompt,
|
| 571 |
+
image=image,
|
| 572 |
+
max_new_tokens=MAX_NEW_TOKENS,
|
| 573 |
+
temperature=0.7,
|
| 574 |
+
top_p=0.9,
|
| 575 |
+
top_k=50,
|
| 576 |
+
repetition_penalty=1.3,
|
| 577 |
+
)
|
| 578 |
+
|
| 579 |
+
if extracted_text and extracted_text.strip():
|
| 580 |
+
# Clean the text
|
| 581 |
+
cleaned_text = extracted_text.strip()
|
| 582 |
+
|
| 583 |
+
# Split into words for compatibility with PaddleOCR format
|
| 584 |
+
words = cleaned_text.split()
|
| 585 |
+
|
| 586 |
+
# Create PaddleOCR-compatible result
|
| 587 |
+
result = {
|
| 588 |
+
"rec_texts": words,
|
| 589 |
+
"rec_scores": [0.95] * len(words), # High confidence for VLM results
|
| 590 |
+
}
|
| 591 |
+
|
| 592 |
+
return result
|
| 593 |
+
else:
|
| 594 |
+
return {"rec_texts": [], "rec_scores": []}
|
| 595 |
+
|
| 596 |
+
except Exception as e:
|
| 597 |
+
print(f"VLM OCR error: {e}")
|
| 598 |
+
return {"rec_texts": [], "rec_scores": []}
|
| 599 |
+
|
| 600 |
+
|
| 601 |
class CustomImageAnalyzerEngine:
|
| 602 |
def __init__(
|
| 603 |
self,
|
|
|
|
| 612 |
"""
|
| 613 |
Initializes the CustomImageAnalyzerEngine.
|
| 614 |
|
| 615 |
+
:param ocr_engine: The OCR engine to use ("tesseract", "hybrid-paddle", "hybrid-vlm", or "paddle").
|
| 616 |
:param analyzer_engine: The Presidio AnalyzerEngine instance.
|
| 617 |
+
:param tesseract_config: Configuration string for Tesseract. If None, uses TESSERACT_SEGMENTATION_LEVEL config.
|
| 618 |
:param paddle_kwargs: Dictionary of keyword arguments for PaddleOCR constructor.
|
| 619 |
:param image_preprocessor: Optional image preprocessor.
|
| 620 |
:param language: Preferred OCR language (e.g., "en", "fr", "de"). Defaults to DEFAULT_LANGUAGE.
|
|
|
|
| 642 |
)
|
| 643 |
self.output_folder = normalized_output_folder
|
| 644 |
|
| 645 |
+
if self.ocr_engine == "paddle" or self.ocr_engine == "hybrid-paddle":
|
| 646 |
if PaddleOCR is None:
|
| 647 |
raise ImportError(
|
| 648 |
"paddleocr is not installed. Please run 'pip install paddleocr paddlepaddle' in your python environment and retry."
|
|
|
|
| 669 |
paddle_kwargs.setdefault("lang", self.paddle_lang)
|
| 670 |
self.paddle_ocr = PaddleOCR(**paddle_kwargs)
|
| 671 |
|
| 672 |
+
elif self.ocr_engine == "hybrid-vlm":
|
| 673 |
+
# VLM-based hybrid OCR - no additional initialization needed
|
| 674 |
+
# The VLM model is loaded when run_vlm.py is imported
|
| 675 |
+
print(f"Initializing hybrid VLM OCR with model: {SELECTED_MODEL}")
|
| 676 |
+
self.paddle_ocr = None # Not using PaddleOCR
|
| 677 |
+
|
| 678 |
if not analyzer_engine:
|
| 679 |
analyzer_engine = AnalyzerEngine()
|
| 680 |
self.analyzer_engine = analyzer_engine
|
| 681 |
|
| 682 |
+
# Set Tesseract configuration based on segmentation level
|
| 683 |
+
if tesseract_config:
|
| 684 |
+
self.tesseract_config = tesseract_config
|
| 685 |
+
else:
|
| 686 |
+
psm_value = _get_tesseract_psm(TESSERACT_SEGMENTATION_LEVEL)
|
| 687 |
+
self.tesseract_config = f"--oem 3 --psm {psm_value}"
|
| 688 |
+
# print(
|
| 689 |
+
# f"Tesseract configured for {TESSERACT_SEGMENTATION_LEVEL}-level segmentation (PSM {psm_value})"
|
| 690 |
+
# )
|
| 691 |
|
| 692 |
if not image_preprocessor:
|
| 693 |
image_preprocessor = ContrastSegmentedImageEnhancer()
|
| 694 |
self.image_preprocessor = image_preprocessor
|
| 695 |
|
| 696 |
+
def _sanitize_filename(
|
| 697 |
+
self, text: str, max_length: int = 20, fallback_prefix: str = "unknown_text"
|
| 698 |
+
) -> str:
|
| 699 |
"""
|
| 700 |
Sanitizes text for use in filenames by removing invalid characters and limiting length.
|
| 701 |
|
| 702 |
:param text: The text to sanitize
|
| 703 |
:param max_length: Maximum length of the sanitized text
|
| 704 |
+
:param fallback_prefix: Prefix to use if sanitization fails
|
| 705 |
:return: Sanitized text safe for filenames
|
| 706 |
"""
|
| 707 |
|
|
|
|
| 716 |
|
| 717 |
# If empty after sanitization, use a default value
|
| 718 |
if not sanitized:
|
| 719 |
+
sanitized = fallback_prefix
|
| 720 |
|
| 721 |
# Limit to max_length characters
|
| 722 |
if len(sanitized) > max_length:
|
|
|
|
| 724 |
# Ensure we don't end with an underscore if we cut in the middle
|
| 725 |
sanitized = sanitized.rstrip("_")
|
| 726 |
|
| 727 |
+
# Final check: if still empty or too short, use fallback
|
| 728 |
+
if not sanitized or len(sanitized) < 3:
|
| 729 |
+
sanitized = fallback_prefix
|
| 730 |
+
|
| 731 |
return sanitized
|
| 732 |
|
| 733 |
+
def _create_safe_filename_with_confidence(
|
| 734 |
+
self,
|
| 735 |
+
original_text: str,
|
| 736 |
+
new_text: str,
|
| 737 |
+
conf: int,
|
| 738 |
+
new_conf: int,
|
| 739 |
+
ocr_type: str = "OCR",
|
| 740 |
+
) -> str:
|
| 741 |
+
"""
|
| 742 |
+
Creates a safe filename using confidence values when text sanitization fails.
|
| 743 |
+
|
| 744 |
+
Args:
|
| 745 |
+
original_text: Original text from Tesseract
|
| 746 |
+
new_text: New text from VLM/PaddleOCR
|
| 747 |
+
conf: Original confidence score
|
| 748 |
+
new_conf: New confidence score
|
| 749 |
+
ocr_type: Type of OCR used (VLM, Paddle, etc.)
|
| 750 |
+
|
| 751 |
+
Returns:
|
| 752 |
+
Safe filename string
|
| 753 |
+
"""
|
| 754 |
+
# Try to sanitize both texts
|
| 755 |
+
safe_original = self._sanitize_filename(
|
| 756 |
+
original_text, max_length=15, fallback_prefix=f"orig_conf_{conf}"
|
| 757 |
+
)
|
| 758 |
+
safe_new = self._sanitize_filename(
|
| 759 |
+
new_text, max_length=15, fallback_prefix=f"new_conf_{new_conf}"
|
| 760 |
+
)
|
| 761 |
+
|
| 762 |
+
# If both sanitizations resulted in fallback names, create a confidence-based name
|
| 763 |
+
if safe_original.startswith("orig_conf_") and safe_new.startswith(
|
| 764 |
+
"new_conf_"
|
| 765 |
+
):
|
| 766 |
+
return f"{ocr_type}_conf_{conf}_to_conf_{new_conf}"
|
| 767 |
+
|
| 768 |
+
return f"{safe_original}_conf_{conf}_to_{safe_new}_conf_{new_conf}"
|
| 769 |
+
|
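Illustrative outputs of the naming scheme above (values invented for the example):

# Both texts sanitise cleanly -> both appear alongside their confidences:
#   "Tota1_conf_42_to_Total_conf_91"
# Both texts fall back to their confidence prefixes -> a purely confidence-based name:
#   "Paddle_conf_42_to_conf_91"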
| 770 |
+
def _convert_line_to_word_level(
|
| 771 |
+
self, line_data: Dict[str, List], image_width: int, image_height: int
|
| 772 |
+
) -> Dict[str, List]:
|
| 773 |
+
"""
|
| 774 |
+
Converts line-level OCR results to word-level results by splitting text and estimating word positions.
|
| 775 |
+
|
| 776 |
+
Args:
|
| 777 |
+
line_data: Dictionary with line-level OCR data (text, left, top, width, height, conf)
|
| 778 |
+
image_width: Width of the original image
|
| 779 |
+
image_height: Height of the original image
|
| 780 |
+
|
| 781 |
+
Returns:
|
| 782 |
+
Dictionary with word-level OCR data in Tesseract format
|
| 783 |
+
"""
|
| 784 |
+
output = {
|
| 785 |
+
"text": list(),
|
| 786 |
+
"left": list(),
|
| 787 |
+
"top": list(),
|
| 788 |
+
"width": list(),
|
| 789 |
+
"height": list(),
|
| 790 |
+
"conf": list(),
|
| 791 |
+
}
|
| 792 |
+
|
| 793 |
+
if not line_data or not line_data.get("text"):
|
| 794 |
+
return output
|
| 795 |
+
|
| 796 |
+
for i in range(len(line_data["text"])):
|
| 797 |
+
line_text = line_data["text"][i]
|
| 798 |
+
line_left = line_data["left"][i]
|
| 799 |
+
line_top = line_data["top"][i]
|
| 800 |
+
line_width = line_data["width"][i]
|
| 801 |
+
line_height = line_data["height"][i]
|
| 802 |
+
line_conf = line_data["conf"][i]
|
| 803 |
+
|
| 804 |
+
# Skip empty lines
|
| 805 |
+
if not line_text.strip():
|
| 806 |
+
continue
|
| 807 |
+
|
| 808 |
+
# Split line into words
|
| 809 |
+
words = line_text.split()
|
| 810 |
+
if not words:
|
| 811 |
+
continue
|
| 812 |
+
|
| 813 |
+
# Calculate character width for this line
|
| 814 |
+
total_chars = len(line_text)
|
| 815 |
+
avg_char_width = line_width / total_chars if total_chars > 0 else 0
|
| 816 |
+
|
| 817 |
+
current_char_offset = 0
|
| 818 |
+
|
| 819 |
+
for word in words:
|
| 820 |
+
# Calculate word width based on character count
|
| 821 |
+
word_width = float(len(word) * avg_char_width)
|
| 822 |
+
word_left = line_left + float(current_char_offset * avg_char_width)
|
| 823 |
+
|
| 824 |
+
# Ensure word doesn't exceed image boundaries
|
| 825 |
+
word_left = max(0, min(word_left, image_width - word_width))
|
| 826 |
+
word_width = min(word_width, image_width - word_left)
|
| 827 |
+
|
| 828 |
+
output["text"].append(word)
|
| 829 |
+
output["left"].append(word_left)
|
| 830 |
+
output["top"].append(line_top)
|
| 831 |
+
output["width"].append(word_width)
|
| 832 |
+
output["height"].append(line_height)
|
| 833 |
+
output["conf"].append(line_conf)
|
| 834 |
+
|
| 835 |
+
# Update offset for the next word (add word length + 1 for the space)
|
| 836 |
+
current_char_offset += len(word) + 1
|
| 837 |
+
|
| 838 |
+
return output
|
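The position estimate is purely character-proportional; a worked example with illustrative numbers:

# Line "John Smith" at left=100, width=200 contains 10 characters,
# so avg_char_width = 200 / 10 = 20.
# "John"  -> left = 100 + 0 * 20 = 100, width = 4 * 20 = 80
# "Smith" -> left = 100 + 5 * 20 = 200, width = 5 * 20 = 100
# Both words inherit the line's top, height, and confidence values.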
| 839 |
+
|
| 840 |
+
def _is_line_level_data(self, ocr_data: Dict[str, List]) -> bool:
|
| 841 |
+
"""
|
| 842 |
+
Determines if OCR data contains line-level results (multiple words per bounding box).
|
| 843 |
+
|
| 844 |
+
Args:
|
| 845 |
+
ocr_data: Dictionary with OCR data
|
| 846 |
+
|
| 847 |
+
Returns:
|
| 848 |
+
True if data appears to be line-level, False otherwise
|
| 849 |
+
"""
|
| 850 |
+
if not ocr_data or not ocr_data.get("text"):
|
| 851 |
+
return False
|
| 852 |
+
|
| 853 |
+
# Check if any text entries contain multiple words
|
| 854 |
+
for text in ocr_data["text"]:
|
| 855 |
+
if text.strip() and len(text.split()) > 1:
|
| 856 |
+
return True
|
| 857 |
+
|
| 858 |
+
return False
|
| 859 |
+
|
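In short (illustrative dictionaries):

# Word-level data is left untouched:
#   {"text": ["John", "Smith"], ...}           -> _is_line_level_data(...) is False
# Line-level data triggers the word-level conversion above:
#   {"text": ["John Smith", "123 Road"], ...}  -> _is_line_level_data(...) is True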
| 860 |
def _convert_paddle_to_tesseract_format(
|
| 861 |
self, paddle_results: List[Any]
|
| 862 |
) -> Dict[str, List]:
|
|
|
|
| 900 |
line_width = float(max(x_coords) - line_left)
|
| 901 |
line_height = float(max(y_coords) - line_top)
|
| 902 |
|
| 903 |
+
# Add line-level data
|
| 904 |
+
output["text"].append(line_text)
|
| 905 |
+
output["left"].append(line_left)
|
| 906 |
+
output["top"].append(line_top)
|
| 907 |
+
output["width"].append(line_width)
|
| 908 |
+
output["height"].append(line_height)
|
| 909 |
+
output["conf"].append(int(line_confidence * 100))
|
| 910 |
|
| 911 |
+
return output
|
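A minimal sketch of the polygon-to-box reduction used above, assuming PaddleOCR returns a four-point quad per detected line:

# Assumed PaddleOCR-style quad: [[x, y], ...] clockwise from top-left.
quad = [[102.0, 50.0], [310.0, 52.0], [309.0, 84.0], [101.0, 82.0]]
x_coords = [p[0] for p in quad]
y_coords = [p[1] for p in quad]
line_left, line_top = min(x_coords), min(y_coords)
line_width = max(x_coords) - line_left   # 209.0
line_height = max(y_coords) - line_top   # 34.0
# Paddle scores are 0-1 floats; the engine rescales them to Tesseract's 0-100 ints:
conf = int(0.87 * 100)                   # 87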
| 912 |
|
| 913 |
+
def _visualize_tesseract_bounding_boxes(
|
| 914 |
+
self,
|
| 915 |
+
image: Image.Image,
|
| 916 |
+
ocr_data: Dict[str, List],
|
| 917 |
+
image_name: str = None,
|
| 918 |
+
visualisation_folder: str = "tesseract_visualisations",
|
| 919 |
+
) -> None:
|
| 920 |
+
"""
|
| 921 |
+
Visualizes Tesseract OCR bounding boxes with confidence-based colors and a legend.
|
| 922 |
|
| 923 |
+
Args:
|
| 924 |
+
image: The PIL Image object
|
| 925 |
+
ocr_data: Tesseract OCR data dictionary
|
| 926 |
+
image_name: Optional name for the saved image file
|
| 927 |
+
"""
|
| 928 |
+
if not ocr_data or not ocr_data.get("text"):
|
| 929 |
+
return
|
| 930 |
|
| 931 |
+
# Convert PIL image to OpenCV format
|
| 932 |
+
image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 933 |
|
| 934 |
+
# Get image dimensions
|
| 935 |
+
height, width = image_cv.shape[:2]
|
| 936 |
|
| 937 |
+
# Define confidence ranges and colors
|
| 938 |
+
confidence_ranges = [
|
| 939 |
+
(80, 100, (0, 255, 0), "High (80-100%)"), # Green
|
| 940 |
+
(50, 79, (0, 165, 255), "Medium (50-79%)"), # Orange
|
| 941 |
+
(0, 49, (0, 0, 255), "Low (0-49%)"), # Red
|
| 942 |
+
]
|
| 943 |
+
|
| 944 |
+
# Process each detected text element
|
| 945 |
+
for i in range(len(ocr_data["text"])):
|
| 946 |
+
text = ocr_data["text"][i]
|
| 947 |
+
conf = int(ocr_data["conf"][i])
|
| 948 |
+
|
| 949 |
+
# Skip empty text or invalid confidence
|
| 950 |
+
if not text.strip() or conf == -1:
|
| 951 |
+
continue
|
| 952 |
+
|
| 953 |
+
left = ocr_data["left"][i]
|
| 954 |
+
top = ocr_data["top"][i]
|
| 955 |
+
width_box = ocr_data["width"][i]
|
| 956 |
+
height_box = ocr_data["height"][i]
|
| 957 |
+
|
| 958 |
+
# Calculate bounding box coordinates
|
| 959 |
+
x1 = int(left)
|
| 960 |
+
y1 = int(top)
|
| 961 |
+
x2 = int(left + width_box)
|
| 962 |
+
y2 = int(top + height_box)
|
| 963 |
+
|
| 964 |
+
# Ensure coordinates are within image bounds
|
| 965 |
+
x1 = max(0, min(x1, width))
|
| 966 |
+
y1 = max(0, min(y1, height))
|
| 967 |
+
x2 = max(0, min(x2, width))
|
| 968 |
+
y2 = max(0, min(y2, height))
|
| 969 |
+
|
| 970 |
+
# Skip if bounding box is invalid
|
| 971 |
+
if x2 <= x1 or y2 <= y1:
|
| 972 |
+
continue
|
| 973 |
+
|
| 974 |
+
# Determine color based on confidence score
|
| 975 |
+
color = (0, 0, 255) # Default to red
|
| 976 |
+
for min_conf, max_conf, conf_color, _ in confidence_ranges:
|
| 977 |
+
if min_conf <= conf <= max_conf:
|
| 978 |
+
color = conf_color
|
| 979 |
+
break
|
| 980 |
+
|
| 981 |
+
# Draw bounding box
|
| 982 |
+
cv2.rectangle(image_cv, (x1, y1), (x2, y2), color, 1)
|
| 983 |
+
|
| 984 |
+
# Add legend
|
| 985 |
+
self._add_confidence_legend(image_cv, confidence_ranges)
|
| 986 |
+
|
| 987 |
+
# Save the visualization
|
| 988 |
+
tesseract_viz_folder = os.path.join(self.output_folder, visualisation_folder)
|
| 989 |
+
|
| 990 |
+
# Double-check the constructed path is safe
|
| 991 |
+
if not validate_folder_containment(tesseract_viz_folder, OUTPUT_FOLDER):
|
| 992 |
+
raise ValueError(
|
| 993 |
+
f"Unsafe tesseract visualisations folder path: {tesseract_viz_folder}"
|
| 994 |
+
)
|
| 995 |
+
|
| 996 |
+
os.makedirs(tesseract_viz_folder, exist_ok=True)
|
| 997 |
+
|
| 998 |
+
# Generate filename
|
| 999 |
+
if image_name:
|
| 1000 |
+
# Remove file extension if present
|
| 1001 |
+
base_name = os.path.splitext(image_name)[0]
|
| 1002 |
+
filename = f"{base_name}_{visualisation_folder}.jpg"
|
| 1003 |
+
else:
|
| 1004 |
+
timestamp = int(time.time())
|
| 1005 |
+
filename = f"{visualisation_folder}_{timestamp}.jpg"
|
| 1006 |
+
|
| 1007 |
+
output_path = os.path.join(tesseract_viz_folder, filename)
|
| 1008 |
+
|
| 1009 |
+
# Save the image
|
| 1010 |
+
cv2.imwrite(output_path, image_cv)
|
| 1011 |
+
print(f"Tesseract visualization saved to: {output_path}")
|
| 1012 |
+
|
| 1013 |
+
def _add_confidence_legend(
|
| 1014 |
+
self, image_cv: np.ndarray, confidence_ranges: List[Tuple]
|
| 1015 |
+
) -> None:
|
| 1016 |
+
"""
|
| 1017 |
+
Adds a confidence legend to the visualization image.
|
| 1018 |
+
|
| 1019 |
+
Args:
|
| 1020 |
+
image_cv: OpenCV image array
|
| 1021 |
+
confidence_ranges: List of tuples containing (min_conf, max_conf, color, label)
|
| 1022 |
+
"""
|
| 1023 |
+
height, width = image_cv.shape[:2]
|
| 1024 |
+
|
| 1025 |
+
# Legend parameters
|
| 1026 |
+
legend_width = 200
|
| 1027 |
+
legend_height = 100
|
| 1028 |
+
legend_x = width - legend_width - 20
|
| 1029 |
+
legend_y = 20
|
| 1030 |
+
|
| 1031 |
+
# Draw legend background
|
| 1032 |
+
cv2.rectangle(
|
| 1033 |
+
image_cv,
|
| 1034 |
+
(legend_x, legend_y),
|
| 1035 |
+
(legend_x + legend_width, legend_y + legend_height),
|
| 1036 |
+
(255, 255, 255), # White background
|
| 1037 |
+
-1,
|
| 1038 |
+
)
|
| 1039 |
+
cv2.rectangle(
|
| 1040 |
+
image_cv,
|
| 1041 |
+
(legend_x, legend_y),
|
| 1042 |
+
(legend_x + legend_width, legend_y + legend_height),
|
| 1043 |
+
(0, 0, 0), # Black border
|
| 1044 |
+
2,
|
| 1045 |
+
)
|
| 1046 |
+
|
| 1047 |
+
# Add title
|
| 1048 |
+
title_text = "Confidence Levels"
|
| 1049 |
+
font_scale = 0.6
|
| 1050 |
+
font_thickness = 2
|
| 1051 |
+
(title_width, title_height), _ = cv2.getTextSize(
|
| 1052 |
+
title_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness
|
| 1053 |
+
)
|
| 1054 |
+
title_x = legend_x + (legend_width - title_width) // 2
|
| 1055 |
+
title_y = legend_y + title_height + 10
|
| 1056 |
+
cv2.putText(
|
| 1057 |
+
image_cv,
|
| 1058 |
+
title_text,
|
| 1059 |
+
(title_x, title_y),
|
| 1060 |
+
cv2.FONT_HERSHEY_SIMPLEX,
|
| 1061 |
+
font_scale,
|
| 1062 |
+
(0, 0, 0), # Black text
|
| 1063 |
+
font_thickness,
|
| 1064 |
+
)
|
| 1065 |
+
|
| 1066 |
+
# Add confidence range items
|
| 1067 |
+
item_spacing = 25
|
| 1068 |
+
start_y = title_y + 25
|
| 1069 |
+
|
| 1070 |
+
for i, (min_conf, max_conf, color, label) in enumerate(confidence_ranges):
|
| 1071 |
+
item_y = start_y + i * item_spacing
|
| 1072 |
+
|
| 1073 |
+
# Draw color box
|
| 1074 |
+
box_size = 15
|
| 1075 |
+
box_x = legend_x + 10
|
| 1076 |
+
box_y = item_y - box_size
|
| 1077 |
+
cv2.rectangle(
|
| 1078 |
+
image_cv,
|
| 1079 |
+
(box_x, box_y),
|
| 1080 |
+
(box_x + box_size, box_y + box_size),
|
| 1081 |
+
color,
|
| 1082 |
+
-1,
|
| 1083 |
+
)
|
| 1084 |
+
cv2.rectangle(
|
| 1085 |
+
image_cv,
|
| 1086 |
+
(box_x, box_y),
|
| 1087 |
+
(box_x + box_size, box_y + box_size),
|
| 1088 |
+
(0, 0, 0), # Black border
|
| 1089 |
+
1,
|
| 1090 |
+
)
|
| 1091 |
+
|
| 1092 |
+
# Add label text
|
| 1093 |
+
label_x = box_x + box_size + 10
|
| 1094 |
+
label_y = item_y - 5
|
| 1095 |
+
cv2.putText(
|
| 1096 |
+
image_cv,
|
| 1097 |
+
label,
|
| 1098 |
+
(label_x, label_y),
|
| 1099 |
+
cv2.FONT_HERSHEY_SIMPLEX,
|
| 1100 |
+
0.5,
|
| 1101 |
+
(0, 0, 0), # Black text
|
| 1102 |
+
1,
|
| 1103 |
+
)
|
| 1104 |
|
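The hybrid pass defined below follows a simple per-word rule; a condensed sketch (threshold value illustrative, and the empty-result branch that discards the word is omitted):

def choose(text, conf, new_text, new_conf, ocr_type, threshold=50):
    # ocr_type is "Paddle" or "VLM", matching the per-word model labels.
    if conf >= threshold or new_conf <= conf:
        return text, conf, "Tesseract"
    return new_text, new_conf, ocr_type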
| 1105 |
def _perform_hybrid_ocr(
|
| 1106 |
self,
|
|
|
|
| 1111 |
image_name: str = "unknown_image_name",
|
| 1112 |
) -> Dict[str, list]:
|
| 1113 |
"""
|
| 1114 |
+
Performs OCR using Tesseract for bounding boxes and PaddleOCR/VLM for low-confidence text.
|
| 1115 |
Returns data in the same dictionary format as pytesseract.image_to_data.
|
| 1116 |
"""
|
| 1117 |
+
# Determine if we're using VLM or PaddleOCR
|
| 1118 |
+
use_vlm = self.ocr_engine == "hybrid-vlm"
|
| 1119 |
+
|
| 1120 |
+
if not use_vlm:
|
| 1121 |
+
if ocr is None:
|
| 1122 |
+
if hasattr(self, "paddle_ocr") and self.paddle_ocr is not None:
|
| 1123 |
+
ocr = self.paddle_ocr
|
| 1124 |
+
else:
|
| 1125 |
+
raise ValueError(
|
| 1126 |
+
"No OCR object provided and 'paddle_ocr' is not initialized."
|
| 1127 |
+
)
|
| 1128 |
|
| 1129 |
print("Starting hybrid OCR process...")
|
| 1130 |
|
|
|
|
| 1143 |
"width": list(),
|
| 1144 |
"height": list(),
|
| 1145 |
"conf": list(),
|
| 1146 |
+
"model": list(), # Track which model was used for each word
|
| 1147 |
}
|
| 1148 |
|
| 1149 |
num_words = len(tesseract_data["text"])
|
|
|
|
| 1164 |
height = tesseract_data["height"][i]
|
| 1165 |
# line_number = tesseract_data['abs_line_id'][i]
|
| 1166 |
|
| 1167 |
+
# Initialize model as Tesseract (default)
|
| 1168 |
+
model_used = "Tesseract"
|
| 1169 |
+
|
| 1170 |
# If confidence is low, use PaddleOCR or the VLM for a second opinion
|
| 1171 |
if conf < confidence_threshold:
|
| 1172 |
img_width, img_height = image.size
|
|
|
|
| 1182 |
cropped_image = image.crop(
|
| 1183 |
(crop_left, crop_top, crop_right, crop_bottom)
|
| 1184 |
)
|
| 1185 |
+
if use_vlm:
|
| 1186 |
+
# Use VLM for OCR
|
| 1187 |
+
vlm_result = _vlm_ocr_predict(cropped_image)
|
| 1188 |
+
rec_texts = vlm_result.get("rec_texts", [])
|
| 1189 |
+
rec_scores = vlm_result.get("rec_scores", [])
|
| 1190 |
+
else:
|
| 1191 |
+
# Use PaddleOCR
|
| 1192 |
+
cropped_image_np = np.array(cropped_image)
|
| 1193 |
|
| 1194 |
+
if len(cropped_image_np.shape) == 2:
|
| 1195 |
+
cropped_image_np = np.stack([cropped_image_np] * 3, axis=-1)
|
| 1196 |
|
| 1197 |
+
paddle_results = ocr.predict(cropped_image_np)
|
| 1198 |
|
| 1199 |
+
if paddle_results and paddle_results[0]:
|
| 1200 |
+
rec_texts = paddle_results[0].get("rec_texts", [])
|
| 1201 |
+
rec_scores = paddle_results[0].get("rec_scores", [])
|
| 1202 |
+
else:
|
| 1203 |
+
rec_texts = []
|
| 1204 |
+
rec_scores = []
|
| 1205 |
|
| 1206 |
+
if rec_texts and rec_scores:
|
| 1207 |
+
new_text = " ".join(rec_texts)
|
| 1208 |
+
new_conf = int(round(np.median(rec_scores) * 100, 0))
|
| 1209 |
|
| 1210 |
+
# Only replace if Paddle's/VLM's confidence is better
|
| 1211 |
+
if new_conf > conf:
|
| 1212 |
+
ocr_type = "VLM" if use_vlm else "Paddle"
|
| 1213 |
+
print(
|
| 1214 |
+
f" Re-OCR'd word: '{text}' (conf: {conf}) -> '{new_text}' (conf: {new_conf:.0f}) [{ocr_type}]"
|
| 1215 |
+
)
|
| 1216 |
|
| 1217 |
+
# Build a safe filename for exporting example image comparisons (only used when saving examples below)
|
| 1218 |
+
safe_filename = self._create_safe_filename_with_confidence(
|
| 1219 |
+
text, new_text, conf, new_conf, ocr_type
|
| 1220 |
+
)
|
| 1221 |
+
|
| 1222 |
+
if SAVE_EXAMPLE_HYBRID_IMAGES is True:
|
| 1223 |
+
# Normalize and validate image_name to prevent path traversal attacks
|
| 1224 |
+
normalized_image_name = os.path.normpath(image_name + "_" + ocr_type)
|
| 1225 |
+
# Ensure the image name doesn't contain path traversal characters
|
| 1226 |
+
if (
|
| 1227 |
+
".." in normalized_image_name
|
| 1228 |
+
or "/" in normalized_image_name
|
| 1229 |
+
or "\\" in normalized_image_name
|
| 1230 |
+
):
|
| 1231 |
+
normalized_image_name = (
|
| 1232 |
+
"safe_image" # Fallback to safe default
|
|
| 1233 |
)
|
|
|
|
|
|
|
| 1234 |
|
| 1235 |
+
hybrid_ocr_examples_folder = (
|
| 1236 |
+
self.output_folder
|
| 1237 |
+
+ f"/hybrid_ocr_examples/{normalized_image_name}"
|
| 1238 |
+
)
|
| 1239 |
+
# Validate the constructed path is safe before creating directories
|
| 1240 |
+
if not validate_folder_containment(
|
| 1241 |
+
hybrid_ocr_examples_folder, OUTPUT_FOLDER
|
| 1242 |
+
):
|
| 1243 |
+
raise ValueError(
|
| 1244 |
+
f"Unsafe hybrid_ocr_examples folder path: {hybrid_ocr_examples_folder}"
|
| 1245 |
+
)
|
| 1246 |
|
| 1247 |
+
if not os.path.exists(hybrid_ocr_examples_folder):
|
| 1248 |
+
os.makedirs(hybrid_ocr_examples_folder)
|
| 1249 |
+
output_image_path = (
|
| 1250 |
+
hybrid_ocr_examples_folder + f"/{safe_filename}.png"
|
| 1251 |
)
|
| 1252 |
+
print(f"Saving example image to {output_image_path}")
|
| 1253 |
+
cropped_image.save(output_image_path)
|
| 1254 |
+
|
| 1255 |
+
text = new_text
|
| 1256 |
+
conf = new_conf
|
| 1257 |
+
model_used = ocr_type # Update model to VLM or Paddle
|
| 1258 |
+
|
| 1259 |
else:
|
| 1260 |
+
ocr_type = "VLM" if use_vlm else "Paddle"
|
| 1261 |
print(
|
| 1262 |
+
f" '{text}' (conf: {conf}) -> {ocr_type} result '{new_text}' (conf: {new_conf:.0f}) was not better. Keeping original."
|
| 1263 |
)
|
|
|
|
| 1264 |
else:
|
| 1265 |
+
# OCR ran but found nothing, discard original word
|
| 1266 |
+
ocr_type = "VLM" if use_vlm else "Paddle"
|
| 1267 |
print(
|
| 1268 |
+
f" '{text}' (conf: {conf}) -> No text found by {ocr_type}. Discarding."
|
| 1269 |
)
|
| 1270 |
text = ""
|
| 1271 |
|
|
|
|
| 1277 |
final_data["width"].append(width)
|
| 1278 |
final_data["height"].append(height)
|
| 1279 |
final_data["conf"].append(int(conf))
|
| 1280 |
+
final_data["model"].append(model_used)
|
| 1281 |
# final_data['line_number'].append(int(line_number))
|
| 1282 |
|
| 1283 |
return final_data
|
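The returned dictionary mirrors pytesseract.image_to_data output, extended with the per-word model column (illustrative values):

final_data = {
    "text":   ["John", "Smith"],
    "left":   [100, 200],
    "top":    [50, 50],
    "width":  [80, 100],
    "height": [30, 30],
    "conf":   [95, 78],
    "model":  ["Tesseract", "Paddle"],  # "Paddle" or "VLM" marks re-OCR'd words
}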
|
|
|
| 1308 |
image_width, image_height = image.size
|
| 1309 |
|
| 1310 |
# Note: In testing I haven't seen that this necessarily improves results
|
| 1311 |
+
if self.ocr_engine == "hybrid-paddle":
|
| 1312 |
# Try hybrid with original image for cropping:
|
| 1313 |
ocr_data = self._perform_hybrid_ocr(image, image_name=image_name)
|
| 1314 |
|
| 1315 |
+
elif self.ocr_engine == "hybrid-vlm":
|
| 1316 |
+
# Try hybrid VLM with original image for cropping:
|
| 1317 |
+
ocr_data = self._perform_hybrid_ocr(image, image_name=image_name)
|
| 1318 |
+
|
| 1319 |
elif self.ocr_engine == "tesseract":
|
| 1320 |
|
| 1321 |
ocr_data = pytesseract.image_to_data(
|
|
|
|
| 1325 |
lang=self.tesseract_lang, # Ensure the Tesseract language data (e.g., fra.traineddata) is installed on your system.
|
| 1326 |
)
|
| 1327 |
|
| 1328 |
+
# Save Tesseract visualization with bounding boxes
|
| 1329 |
+
# if SAVE_TESSERACT_VISUALISATIONS is True:
|
| 1330 |
+
# self._visualize_tesseract_bounding_boxes(
|
| 1331 |
+
# image,
|
| 1332 |
+
# ocr_data,
|
| 1333 |
+
# image_name,
|
| 1334 |
+
# visualisation_folder="tesseract_visualisations",
|
| 1335 |
+
# )
|
| 1336 |
+
|
| 1337 |
elif self.ocr_engine == "paddle":
|
| 1338 |
|
| 1339 |
if ocr is None:
|
|
|
|
| 1385 |
|
| 1386 |
ocr_data = self._convert_paddle_to_tesseract_format(paddle_results)
|
| 1387 |
|
| 1388 |
+
# if SAVE_PADDLE_VISUALISATIONS is True:
|
| 1389 |
+
# # Save Paddle visualization with bounding boxes
|
| 1390 |
+
# self._visualize_tesseract_bounding_boxes(
|
| 1391 |
+
# image,
|
| 1392 |
+
# ocr_data,
|
| 1393 |
+
# image_name,
|
| 1394 |
+
# visualisation_folder="paddle_visualisations",
|
| 1395 |
+
# )
|
| 1396 |
+
|
| 1397 |
else:
|
| 1398 |
raise RuntimeError(f"Unsupported OCR engine: {self.ocr_engine}")
|
| 1399 |
|
| 1400 |
+
# Convert line-level results to word-level if configured and needed
|
| 1401 |
+
if CONVERT_LINE_TO_WORD_LEVEL and self._is_line_level_data(ocr_data):
|
| 1402 |
+
print("Converting line-level OCR results to word-level...")
|
| 1403 |
+
ocr_data = self._convert_line_to_word_level(
|
| 1404 |
+
ocr_data, image_width, image_height
|
| 1405 |
+
)
|
| 1406 |
+
|
| 1407 |
+
# Always check for scale_factor, even if preprocessing_metadata is empty
|
| 1408 |
+
# This ensures rescaling happens correctly when preprocessing was applied
|
| 1409 |
+
scale_factor = preprocessing_metadata.get("scale_factor", 1.0) if preprocessing_metadata else 1.0
|
| 1410 |
+
if scale_factor != 1.0:
|
| 1411 |
+
# print(f"Rescaling OCR data by scale factor: {scale_factor} (converting from preprocessed to original image coordinates)")
|
| 1412 |
+
# print(f"OCR data before rescaling (first 3 entries): {dict((k, v[:3] if isinstance(v, list) else v) for k, v in list(ocr_data.items())[:3])}")
|
| 1413 |
ocr_data = rescale_ocr_data(ocr_data, scale_factor)
|
| 1414 |
+
# print(f"OCR data after rescaling (first 3 entries): {dict((k, v[:3] if isinstance(v, list) else v) for k, v in list(ocr_data.items())[:3])}")
|
| 1415 |
|
| 1416 |
# The rest of your processing pipeline now works for both engines
|
| 1417 |
ocr_result = ocr_data
|
|
|
|
| 1423 |
if text.strip() and int(ocr_result["conf"][i]) > 0
|
| 1424 |
]
|
| 1425 |
|
| 1426 |
+
# Determine default model based on OCR engine if model field is not present
|
| 1427 |
+
if "model" in ocr_result and len(ocr_result["model"]) == len(ocr_result["text"]):
|
| 1428 |
+
# Model field exists and has correct length - use it
|
| 1429 |
+
get_model = lambda idx: ocr_result["model"][idx]
|
| 1430 |
+
else:
|
| 1431 |
+
# Model field not present or incorrect length - use default based on engine
|
| 1432 |
+
default_model = (
|
| 1433 |
+
"Tesseract" if self.ocr_engine == "tesseract" else
|
| 1434 |
+
"Paddle" if self.ocr_engine == "paddle" else
|
| 1435 |
+
"hybrid-paddle" if self.ocr_engine == "hybrid-paddle" else
|
| 1436 |
+
"VLM" if self.ocr_engine == "hybrid-vlm" else None
|
| 1437 |
+
)
|
| 1438 |
+
get_model = lambda idx: default_model
|
| 1439 |
+
|
| 1440 |
return [
|
| 1441 |
OCRResult(
|
| 1442 |
text=clean_unicode_text(ocr_result["text"][i]),
|
|
|
|
| 1445 |
width=ocr_result["width"][i],
|
| 1446 |
height=ocr_result["height"][i],
|
| 1447 |
conf=round(float(ocr_result["conf"][i]), 0),
|
| 1448 |
+
model=get_model(i),
|
| 1449 |
# line_number=ocr_result['abs_line_id'][i]
|
| 1450 |
)
|
| 1451 |
for i in valid_indices
|
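The engine-to-default-model mapping above collapses to a small lookup; an equivalent sketch:

def default_model_for(engine):
    # Used only when the OCR data carries no per-word "model" column.
    return {
        "tesseract": "Tesseract",
        "paddle": "Paddle",
        "hybrid-paddle": "hybrid-paddle",
        "hybrid-vlm": "VLM",
    }.get(engine)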
|
|
|
| 1502 |
if language_supported_entities:
|
| 1503 |
text_analyzer_kwargs["entities"] = language_supported_entities
|
| 1504 |
| 1505 |
else:
|
| 1506 |
print(f"No relevant entities supported for language: {language}")
|
| 1507 |
raise Warning(
|
|
|
|
| 2457 |
word.top + word.height,
|
| 2458 |
),
|
| 2459 |
"conf": word.conf,
|
| 2460 |
+
"model": word.model,
|
| 2461 |
}
|
| 2462 |
for word in current_line
|
| 2463 |
],
|
tools/file_redaction.py
CHANGED
|
@@ -8,7 +8,9 @@ from datetime import datetime
|
|
| 8 |
from typing import Any, Dict, List, Optional, Tuple
|
| 9 |
|
| 10 |
import boto3
|
|
|
|
| 11 |
import gradio as gr
|
|
|
|
| 12 |
import pandas as pd
|
| 13 |
import pymupdf
|
| 14 |
from gradio import Progress
|
|
@@ -53,11 +55,14 @@ from tools.config import (
|
|
| 53 |
MAX_TIME_VALUE,
|
| 54 |
NO_REDACTION_PII_OPTION,
|
| 55 |
OUTPUT_FOLDER,
|
|
|
|
| 56 |
PAGE_BREAK_VALUE,
|
| 57 |
PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS,
|
| 58 |
RETURN_PDF_FOR_REVIEW,
|
| 59 |
RETURN_REDACTED_PDF,
|
| 60 |
RUN_AWS_FUNCTIONS,
|
|
|
|
|
|
|
| 61 |
SELECTABLE_TEXT_EXTRACT_OPTION,
|
| 62 |
TESSERACT_TEXT_EXTRACT_OPTION,
|
| 63 |
TEXTRACT_TEXT_EXTRACT_OPTION,
|
|
@@ -104,6 +109,7 @@ from tools.load_spacy_model_custom_recognisers import (
|
|
| 104 |
)
|
| 105 |
from tools.secure_path_utils import (
|
| 106 |
secure_file_write,
|
|
|
|
| 107 |
validate_path_containment,
|
| 108 |
)
|
| 109 |
|
|
@@ -322,7 +328,7 @@ def choose_and_run_redactor(
|
|
| 322 |
- all_page_line_level_ocr_results (list, optional): All line level text on the page with bounding boxes.
|
| 323 |
- all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
|
| 324 |
- all_page_line_level_ocr_results_with_words_df (pd.Dataframe, optional): All word level text on the page with bounding boxes as a dataframe.
|
| 325 |
-
- chosen_local_model (str): Which local model is being used for OCR on images - uses the value of CHOSEN_LOCAL_OCR_MODEL by default, choices are "tesseract", "paddle" for PaddleOCR, or "hybrid" to combine both.
|
| 326 |
- language (str, optional): The language of the text in the files. Defaults to English.
|
| 327 |
- language (str, optional): The language to use for AWS Comprehend calls. Defaults to the value of language if not provided.
|
| 328 |
- ocr_review_files (list, optional): A list of OCR review files to be used for the redaction process. Defaults to an empty list.
|
|
@@ -978,8 +984,10 @@ def choose_and_run_redactor(
|
|
| 978 |
)
|
| 979 |
|
| 980 |
if not all_page_line_level_ocr_results_with_words:
|
| 981 |
-
if
|
| 982 |
-
|
| 983 |
):
|
| 984 |
(
|
| 985 |
all_page_line_level_ocr_results_with_words,
|
|
@@ -1010,7 +1018,7 @@ def choose_and_run_redactor(
|
|
| 1010 |
(
|
| 1011 |
pymupdf_doc,
|
| 1012 |
all_pages_decision_process_table,
|
| 1013 |
-
|
| 1014 |
new_textract_request_metadata,
|
| 1015 |
annotations_all_pages,
|
| 1016 |
current_loop_page,
|
|
@@ -3118,7 +3126,7 @@ def redact_image_pdf(
|
|
| 3118 |
- text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
|
| 3119 |
- all_page_line_level_ocr_results (optional): List of all page line level OCR results.
|
| 3120 |
- all_page_line_level_ocr_results_with_words (optional): List of all page line level OCR results with words.
|
| 3121 |
-
- chosen_local_model (str, optional): The local model chosen for OCR. Defaults to CHOSEN_LOCAL_OCR_MODEL, other choices are "paddle" for PaddleOCR, or "hybrid" for a combination of both.
|
| 3122 |
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to PAGE_BREAK_VALUE.
|
| 3123 |
- log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
|
| 3124 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
|
@@ -3207,11 +3215,16 @@ def redact_image_pdf(
|
|
| 3207 |
# If running Textract, check if file already exists. If it does, load in existing data
|
| 3208 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
| 3209 |
textract_json_file_path = output_folder + file_name + "_textract.json"
|
| 3210 |
-
|
| 3211 |
-
|
| 3212 |
-
|
|
|
|
|
| 3213 |
)
|
| 3214 |
-
)
|
| 3215 |
original_textract_data = textract_data.copy()
|
| 3216 |
|
| 3217 |
# print("Successfully loaded in Textract analysis results from file")
|
|
@@ -3221,15 +3234,20 @@ def redact_image_pdf(
|
|
| 3221 |
all_page_line_level_ocr_results_with_words_json_file_path = (
|
| 3222 |
output_folder + file_name + "_ocr_results_with_words_local_ocr.json"
|
| 3223 |
)
|
| 3224 |
-
|
| 3225 |
-
|
| 3226 |
-
|
| 3227 |
-
|
| 3228 |
-
|
| 3229 |
-
|
| 3230 |
-
|
| 3231 |
-
|
| 3232 |
-
|
|
|
|
|
|
|
| 3233 |
original_all_page_line_level_ocr_results_with_words = (
|
| 3234 |
all_page_line_level_ocr_results_with_words.copy()
|
| 3235 |
)
|
|
@@ -3536,6 +3554,24 @@ def redact_image_pdf(
|
|
| 3536 |
line_level_ocr_results_df.to_dict("records")
|
| 3537 |
)
|
| 3538 |
|
| 3539 |
if (
|
| 3540 |
pii_identification_method != NO_REDACTION_PII_OPTION
|
| 3541 |
or RETURN_PDF_FOR_REVIEW is True
|
|
@@ -4867,3 +4903,395 @@ def redact_text_pdf(
|
|
| 4867 |
comprehend_query_number,
|
| 4868 |
all_page_line_level_ocr_results_with_words,
|
| 4869 |
)
|
| 8 |
from typing import Any, Dict, List, Optional, Tuple
|
| 9 |
|
| 10 |
import boto3
|
| 11 |
+
import cv2
|
| 12 |
import gradio as gr
|
| 13 |
+
import numpy as np
|
| 14 |
import pandas as pd
|
| 15 |
import pymupdf
|
| 16 |
from gradio import Progress
|
|
|
|
| 55 |
MAX_TIME_VALUE,
|
| 56 |
NO_REDACTION_PII_OPTION,
|
| 57 |
OUTPUT_FOLDER,
|
| 58 |
+
OVERWRITE_EXISTING_OCR_RESULTS,
|
| 59 |
PAGE_BREAK_VALUE,
|
| 60 |
PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS,
|
| 61 |
RETURN_PDF_FOR_REVIEW,
|
| 62 |
RETURN_REDACTED_PDF,
|
| 63 |
RUN_AWS_FUNCTIONS,
|
| 64 |
+
SAVE_TEXTRACT_VISUALISATIONS,
|
| 65 |
+
SAVE_TESSERACT_VISUALISATIONS,
|
| 66 |
SELECTABLE_TEXT_EXTRACT_OPTION,
|
| 67 |
TESSERACT_TEXT_EXTRACT_OPTION,
|
| 68 |
TEXTRACT_TEXT_EXTRACT_OPTION,
|
|
|
|
| 109 |
)
|
| 110 |
from tools.secure_path_utils import (
|
| 111 |
secure_file_write,
|
| 112 |
+
validate_folder_containment,
|
| 113 |
validate_path_containment,
|
| 114 |
)
|
| 115 |
|
|
|
|
| 328 |
- all_page_line_level_ocr_results (list, optional): All line level text on the page with bounding boxes.
|
| 329 |
- all_page_line_level_ocr_results_with_words (list, optional): All word level text on the page with bounding boxes.
|
| 330 |
- all_page_line_level_ocr_results_with_words_df (pd.Dataframe, optional): All word level text on the page with bounding boxes as a dataframe.
|
| 331 |
+
- chosen_local_model (str): Which local model is being used for OCR on images - uses the value of CHOSEN_LOCAL_OCR_MODEL by default, choices are "tesseract", "paddle" for PaddleOCR, or "hybrid-paddle" to combine both.
|
| 332 |
- language (str, optional): The language of the text in the files. Defaults to English.
|
| 333 |
- language (str, optional): The language to use for AWS Comprehend calls. Defaults to the value of language if not provided.
|
| 334 |
- ocr_review_files (list, optional): A list of OCR review files to be used for the redaction process. Defaults to an empty list.
|
|
|
|
| 984 |
)
|
| 985 |
|
| 986 |
if not all_page_line_level_ocr_results_with_words:
|
| 987 |
+
if (
|
| 988 |
+
not OVERWRITE_EXISTING_OCR_RESULTS
|
| 989 |
+
and local_ocr_output_found_checkbox is True
|
| 990 |
+
and os.path.exists(all_page_line_level_ocr_results_with_words_json_file_path)
|
| 991 |
):
|
| 992 |
(
|
| 993 |
all_page_line_level_ocr_results_with_words,
|
|
|
|
| 1018 |
(
|
| 1019 |
pymupdf_doc,
|
| 1020 |
all_pages_decision_process_table,
|
| 1021 |
+
log_files_output_paths,
|
| 1022 |
new_textract_request_metadata,
|
| 1023 |
annotations_all_pages,
|
| 1024 |
current_loop_page,
|
|
|
|
| 3126 |
- text_extraction_only (bool, optional): Should the function only extract text, or also do redaction.
|
| 3127 |
- all_page_line_level_ocr_results (optional): List of all page line level OCR results.
|
| 3128 |
- all_page_line_level_ocr_results_with_words (optional): List of all page line level OCR results with words.
|
| 3129 |
+
- chosen_local_model (str, optional): The local model chosen for OCR. Defaults to CHOSEN_LOCAL_OCR_MODEL, other choices are "paddle" for PaddleOCR, or "hybrid-paddle" for a combination of both.
|
| 3130 |
- page_break_val (int, optional): The value at which to trigger a page break. Defaults to PAGE_BREAK_VALUE.
|
| 3131 |
- log_files_output_paths (List, optional): List of file paths used for saving redaction process logging results.
|
| 3132 |
- max_time (int, optional): The maximum amount of time (s) that the function should be running before it breaks. To avoid timeout errors with some APIs.
|
|
|
|
| 3215 |
# If running Textract, check if file already exists. If it does, load in existing data
|
| 3216 |
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
| 3217 |
textract_json_file_path = output_folder + file_name + "_textract.json"
|
| 3218 |
+
if OVERWRITE_EXISTING_OCR_RESULTS:
|
| 3219 |
+
# Skip loading existing results, start fresh
|
| 3220 |
+
textract_data = {}
|
| 3221 |
+
is_missing = True
|
| 3222 |
+
else:
|
| 3223 |
+
textract_data, is_missing, log_files_output_paths = (
|
| 3224 |
+
load_and_convert_textract_json(
|
| 3225 |
+
textract_json_file_path, log_files_output_paths, page_sizes_df
|
| 3226 |
+
)
|
| 3227 |
)
|
|
|
|
| 3228 |
original_textract_data = textract_data.copy()
|
| 3229 |
|
| 3230 |
# print("Successfully loaded in Textract analysis results from file")
|
|
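Both cached-result branches follow the same overwrite-or-load pattern; a simplified sketch (helper name hypothetical, loader signatures reduced to a single argument):

def cached_or_fresh(path, loader, empty):
    # OVERWRITE_EXISTING_OCR_RESULTS forces a fresh extraction pass,
    # ignoring any JSON results saved by a previous run.
    if OVERWRITE_EXISTING_OCR_RESULTS:
        return empty, True   # (data, is_missing)
    return loader(path)      # loads and converts the saved JSON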
|
|
| 3234 |
all_page_line_level_ocr_results_with_words_json_file_path = (
|
| 3235 |
output_folder + file_name + "_ocr_results_with_words_local_ocr.json"
|
| 3236 |
)
|
| 3237 |
+
if OVERWRITE_EXISTING_OCR_RESULTS:
|
| 3238 |
+
# Skip loading existing results, start fresh
|
| 3239 |
+
all_page_line_level_ocr_results_with_words = []
|
| 3240 |
+
is_missing = True
|
| 3241 |
+
else:
|
| 3242 |
+
(
|
| 3243 |
+
all_page_line_level_ocr_results_with_words,
|
| 3244 |
+
is_missing,
|
| 3245 |
+
log_files_output_paths,
|
| 3246 |
+
) = load_and_convert_ocr_results_with_words_json(
|
| 3247 |
+
all_page_line_level_ocr_results_with_words_json_file_path,
|
| 3248 |
+
log_files_output_paths,
|
| 3249 |
+
page_sizes_df,
|
| 3250 |
+
)
|
| 3251 |
original_all_page_line_level_ocr_results_with_words = (
|
| 3252 |
all_page_line_level_ocr_results_with_words.copy()
|
| 3253 |
)
|
|
|
|
| 3554 |
line_level_ocr_results_df.to_dict("records")
|
| 3555 |
)
|
| 3556 |
|
| 3557 |
+
# Save OCR visualization with bounding boxes (works for all OCR methods)
|
| 3558 |
+
if (
|
| 3559 |
+
text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION
|
| 3560 |
+
and SAVE_TEXTRACT_VISUALISATIONS is True
|
| 3561 |
+
) or (
|
| 3562 |
+
text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION
|
| 3563 |
+
and SAVE_TESSERACT_VISUALISATIONS is True
|
| 3564 |
+
):
|
| 3565 |
+
if page_line_level_ocr_results_with_words and "results" in page_line_level_ocr_results_with_words:
|
| 3566 |
+
log_files_output_paths = visualise_ocr_words_bounding_boxes(
|
| 3567 |
+
image,
|
| 3568 |
+
page_line_level_ocr_results_with_words["results"],
|
| 3569 |
+
image_name=f"{file_name}_{reported_page_number}",
|
| 3570 |
+
output_folder=output_folder,
|
| 3571 |
+
text_extraction_method=text_extraction_method,
|
| 3572 |
+
log_files_output_paths=log_files_output_paths,
|
| 3573 |
+
)
|
| 3574 |
+
|
| 3575 |
if (
|
| 3576 |
pii_identification_method != NO_REDACTION_PII_OPTION
|
| 3577 |
or RETURN_PDF_FOR_REVIEW is True
|
|
|
|
| 4903 |
comprehend_query_number,
|
| 4904 |
all_page_line_level_ocr_results_with_words,
|
| 4905 |
)
|
| 4906 |
+
|
| 4907 |
+
|
| 4908 |
+
def visualise_ocr_words_bounding_boxes(
|
| 4909 |
+
image: "Image.Image",
|
| 4910 |
+
ocr_results: Dict[str, Any],
|
| 4911 |
+
image_name: str = None,
|
| 4912 |
+
output_folder: str = OUTPUT_FOLDER,
|
| 4913 |
+
text_extraction_method: str = None,
|
| 4914 |
+
visualisation_folder: str = None,
|
| 4915 |
+
add_legend: bool = True,
|
| 4916 |
+
log_files_output_paths: List[str] = [],
|
| 4917 |
+
) -> None:
|
| 4918 |
+
"""
|
| 4919 |
+
Visualizes OCR bounding boxes with confidence-based colors and a legend.
|
| 4920 |
+
Handles word-level OCR results from Textract and Tesseract.
|
| 4921 |
+
|
| 4922 |
+
Args:
|
| 4923 |
+
image: The PIL Image object
|
| 4924 |
+
ocr_results: Dictionary containing word-level OCR results
|
| 4925 |
+
image_name: Optional name for the saved image file
|
| 4926 |
+
output_folder: Output folder path
|
| 4927 |
+
text_extraction_method: The text extraction method being used (determines folder name)
|
| 4928 |
+
visualisation_folder: Subfolder name for visualizations (auto-determined if not provided)
|
| 4929 |
+
add_legend: Whether to add a legend to the visualization
|
| 4930 |
+
log_files_output_paths: List of file paths used for saving redaction process logging results.
|
| 4931 |
+
"""
|
| 4932 |
+
# Determine visualization folder based on text extraction method
|
| 4933 |
+
if visualisation_folder is None:
|
| 4934 |
+
if text_extraction_method == TEXTRACT_TEXT_EXTRACT_OPTION:
|
| 4935 |
+
visualisation_folder = "textract_visualisations"
|
| 4936 |
+
elif text_extraction_method == TESSERACT_TEXT_EXTRACT_OPTION:
|
| 4937 |
+
visualisation_folder = "tesseract_visualisations"
|
| 4938 |
+
else:
|
| 4939 |
+
visualisation_folder = "ocr_visualisations"
|
| 4940 |
+
if not ocr_results:
|
| 4941 |
+
return log_files_output_paths
|
| 4942 |
+
|
| 4943 |
+
# Convert PIL image to OpenCV format
|
| 4944 |
+
image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
|
| 4945 |
+
|
| 4946 |
+
# Get image dimensions
|
| 4947 |
+
height, width = image_cv.shape[:2]
|
| 4948 |
+
|
| 4949 |
+
# Define confidence ranges and colors for bounding boxes (bright colors)
|
| 4950 |
+
confidence_ranges = [
|
| 4951 |
+
(80, 100, (0, 255, 0), "High (80-100%)"), # Green
|
| 4952 |
+
(50, 79, (0, 165, 255), "Medium (50-79%)"), # Orange
|
| 4953 |
+
(0, 49, (0, 0, 255), "Low (0-49%)"), # Red
|
| 4954 |
+
]
|
| 4955 |
+
|
| 4956 |
+
# Define darker colors for text on white background
|
| 4957 |
+
text_confidence_ranges = [
|
| 4958 |
+
(80, 100, (0, 150, 0), "High (80-100%)"), # Dark Green
|
| 4959 |
+
(50, 79, (0, 100, 200), "Medium (50-79%)"), # Dark Orange
|
| 4960 |
+
(0, 49, (0, 0, 180), "Low (0-49%)"), # Dark Red
|
| 4961 |
+
]
|
| 4962 |
+
|
| 4963 |
+
# Process each line's words
|
| 4964 |
+
for line_key, line_data in ocr_results.items():
|
| 4965 |
+
if not isinstance(line_data, dict) or 'words' not in line_data:
|
| 4966 |
+
continue
|
| 4967 |
+
|
| 4968 |
+
words = line_data.get('words', [])
|
| 4969 |
+
|
| 4970 |
+
# Process each word in the line
|
| 4971 |
+
for word_data in words:
|
| 4972 |
+
if not isinstance(word_data, dict):
|
| 4973 |
+
continue
|
| 4974 |
+
|
| 4975 |
+
text = word_data.get('text', '')
|
| 4976 |
+
# Handle both 'conf' and 'confidence' field names for compatibility
|
| 4977 |
+
conf = int(word_data.get('conf', word_data.get('confidence', 0)))
|
| 4978 |
+
|
| 4979 |
+
# Skip empty text or invalid confidence
|
| 4980 |
+
if not text.strip() or conf == -1:
|
| 4981 |
+
continue
|
| 4982 |
+
|
| 4983 |
+
# Get bounding box coordinates
|
| 4984 |
+
bbox = word_data.get('bounding_box', (0, 0, 0, 0))
|
| 4985 |
+
if len(bbox) != 4:
|
| 4986 |
+
continue
|
| 4987 |
+
|
| 4988 |
+
x1, y1, x2, y2 = bbox
|
| 4989 |
+
|
| 4990 |
+
# Ensure coordinates are within image bounds
|
| 4991 |
+
x1 = max(0, min(int(x1), width))
|
| 4992 |
+
y1 = max(0, min(int(y1), height))
|
| 4993 |
+
x2 = max(0, min(int(x2), width))
|
| 4994 |
+
y2 = max(0, min(int(y2), height))
|
| 4995 |
+
|
| 4996 |
+
# Skip if bounding box is invalid
|
| 4997 |
+
if x2 <= x1 or y2 <= y1:
|
| 4998 |
+
continue
|
| 4999 |
+
|
| 5000 |
+
# Check if word was replaced by a different model
|
| 5001 |
+
model = word_data.get('model', None)
|
| 5002 |
+
is_replaced = model and model != "Tesseract"
|
| 5003 |
+
|
| 5004 |
+
# Determine bounding box color: grey for replaced words, otherwise based on confidence
|
| 5005 |
+
# if is_replaced:
|
| 5006 |
+
# box_color = (128, 128, 128) # Grey for model replacements (bounding box only)
|
| 5007 |
+
# else:
|
| 5008 |
+
box_color = (0, 0, 255) # Default to red
|
| 5009 |
+
for min_conf, max_conf, conf_color, _ in confidence_ranges:
|
| 5010 |
+
if min_conf <= conf <= max_conf:
|
| 5011 |
+
box_color = conf_color
|
| 5012 |
+
break
|
| 5013 |
+
|
| 5014 |
+
# Draw bounding box
|
| 5015 |
+
cv2.rectangle(image_cv, (x1, y1), (x2, y2), box_color, 1)
|
| 5016 |
+
|
| 5017 |
+
# Add legend
|
| 5018 |
+
if add_legend:
|
| 5019 |
+
add_confidence_legend(image_cv, confidence_ranges, show_model_replacement=True)
|
| 5020 |
+
|
| 5021 |
+
# Create second page with text overlay
|
| 5022 |
+
text_page = np.ones((height, width, 3), dtype=np.uint8) * 255 # White background
|
| 5023 |
+
|
| 5024 |
+
# Process each line's words for text overlay
|
| 5025 |
+
for line_key, line_data in ocr_results.items():
|
| 5026 |
+
if not isinstance(line_data, dict) or 'words' not in line_data:
|
| 5027 |
+
continue
|
| 5028 |
+
|
| 5029 |
+
words = line_data.get('words', [])
|
| 5030 |
+
|
| 5031 |
+
# Process each word in the line
|
| 5032 |
+
for word_data in words:
|
| 5033 |
+
if not isinstance(word_data, dict):
|
| 5034 |
+
continue
|
| 5035 |
+
|
| 5036 |
+
text = word_data.get('text', '')
|
| 5037 |
+
# Handle both 'conf' and 'confidence' field names for compatibility
|
| 5038 |
+
conf = int(word_data.get('conf', word_data.get('confidence', 0)))
|
| 5039 |
+
|
| 5040 |
+
# Skip empty text or invalid confidence
|
| 5041 |
+
if not text.strip() or conf == -1:
|
| 5042 |
+
continue
|
| 5043 |
+
|
| 5044 |
+
# Get bounding box coordinates
|
| 5045 |
+
bbox = word_data.get('bounding_box', (0, 0, 0, 0))
|
| 5046 |
+
if len(bbox) != 4:
|
| 5047 |
+
continue
|
| 5048 |
+
|
| 5049 |
+
x1, y1, x2, y2 = bbox
|
| 5050 |
+
|
| 5051 |
+
# Ensure coordinates are within image bounds
|
| 5052 |
+
x1 = max(0, min(int(x1), width))
|
| 5053 |
+
y1 = max(0, min(int(y1), height))
|
| 5054 |
+
x2 = max(0, min(int(x2), width))
|
| 5055 |
+
y2 = max(0, min(int(y2), height))
|
| 5056 |
+
|
| 5057 |
+
# Skip if bounding box is invalid
|
| 5058 |
+
if x2 <= x1 or y2 <= y1:
|
| 5059 |
+
continue
|
| 5060 |
+
|
| 5061 |
+
# Check if word was replaced by a different model (for reference, but text color always uses confidence)
|
| 5062 |
+
model = word_data.get('model', None)
|
| 5063 |
+
is_replaced = model and model != "Tesseract"
|
| 5064 |
+
|
| 5065 |
+
# Text color always based on confidence (not affected by model replacement)
|
| 5066 |
+
text_color = (0, 0, 180) # Default to dark red
|
| 5067 |
+
for min_conf, max_conf, conf_color, _ in text_confidence_ranges:
|
| 5068 |
+
if min_conf <= conf <= max_conf:
|
| 5069 |
+
text_color = conf_color
|
| 5070 |
+
break
|
| 5071 |
+
|
| 5072 |
+
# Calculate font size to fit text within bounding box
|
| 5073 |
+
box_width = x2 - x1
|
| 5074 |
+
box_height = y2 - y1
|
| 5075 |
+
|
| 5076 |
+
# Start with a reasonable font scale
|
| 5077 |
+
font_scale = 0.5
|
| 5078 |
+
font_thickness = 1
|
| 5079 |
+
font = cv2.FONT_HERSHEY_SIMPLEX
|
| 5080 |
+
|
| 5081 |
+
# Get text size and adjust to fit
|
| 5082 |
+
(text_width, text_height), baseline = cv2.getTextSize(
|
| 5083 |
+
text, font, font_scale, font_thickness
|
| 5084 |
+
)
|
| 5085 |
+
|
| 5086 |
+
# Scale font to fit width (with some padding)
|
| 5087 |
+
if text_width > 0:
|
| 5088 |
+
width_scale = (box_width * 0.9) / text_width
|
| 5089 |
+
else:
|
| 5090 |
+
width_scale = 1.0
|
| 5091 |
+
|
| 5092 |
+
# Scale font to fit height (with some padding)
|
| 5093 |
+
if text_height > 0:
|
| 5094 |
+
height_scale = (box_height * 0.8) / text_height
|
| 5095 |
+
else:
|
| 5096 |
+
height_scale = 1.0
|
| 5097 |
+
|
| 5098 |
+
# Use the smaller scale to ensure text fits both dimensions
|
| 5099 |
+
font_scale = min(font_scale * min(width_scale, height_scale), 2.0) # Cap at 2.0
|
| 5100 |
+
|
| 5101 |
+
# Recalculate text size with adjusted font scale
|
| 5102 |
+
(text_width, text_height), baseline = cv2.getTextSize(
|
| 5103 |
+
text, font, font_scale, font_thickness
|
| 5104 |
+
)
|
| 5105 |
+
|
| 5106 |
+
# Center text within bounding box
|
| 5107 |
+
text_x = x1 + (box_width - text_width) // 2
|
| 5108 |
+
text_y = y1 + (box_height + text_height) // 2 # Baseline adjustment
|
| 5109 |
+
|
| 5110 |
+
# Draw text
|
| 5111 |
+
cv2.putText(
|
| 5112 |
+
text_page,
|
| 5113 |
+
text,
|
| 5114 |
+
(text_x, text_y),
|
| 5115 |
+
font,
|
| 5116 |
+
font_scale,
|
| 5117 |
+
text_color,
|
| 5118 |
+
font_thickness,
|
| 5119 |
+
cv2.LINE_AA
|
| 5120 |
+
)
|
| 5121 |
+
|
| 5122 |
+
# Draw grey bounding box for replaced words on text page
|
| 5123 |
+
if is_replaced:
|
| 5124 |
+
box_color = (128, 128, 128) # Grey for model replacements
|
| 5125 |
+
cv2.rectangle(text_page, (x1, y1), (x2, y2), box_color, 1)
|
| 5126 |
+
|
| 5127 |
+
# Add legend to second page
|
| 5128 |
+
if add_legend:
|
| 5129 |
+
add_confidence_legend(text_page, text_confidence_ranges, show_model_replacement=True)
|
| 5130 |
+
|
| 5131 |
+
# Concatenate images horizontally
|
| 5132 |
+
combined_image = np.hstack([image_cv, text_page])
|
| 5133 |
+
|
| 5134 |
+
# Save the visualization
|
| 5135 |
+
if output_folder:
|
| 5136 |
+
textract_viz_folder = os.path.join(output_folder, visualisation_folder)
|
| 5137 |
+
|
| 5138 |
+
# Double-check the constructed path is safe
|
| 5139 |
+
if not validate_folder_containment(textract_viz_folder, OUTPUT_FOLDER):
|
| 5140 |
+
raise ValueError(
|
| 5141 |
+
f"Unsafe textract visualisations folder path: {textract_viz_folder}"
|
| 5142 |
+
)
|
| 5143 |
+
|
| 5144 |
+
os.makedirs(textract_viz_folder, exist_ok=True)
|
| 5145 |
+
|
| 5146 |
+
# Generate filename
|
| 5147 |
+
if image_name:
|
| 5148 |
+
# Remove file extension if present
|
| 5149 |
+
base_name = os.path.splitext(image_name)[0]
|
| 5150 |
+
filename = f"{base_name}_{visualisation_folder}.jpg"
|
| 5151 |
+
else:
|
| 5152 |
+
timestamp = int(time.time())
|
| 5153 |
+
filename = f"{visualisation_folder}_{timestamp}.jpg"
|
| 5154 |
+
|
| 5155 |
+
output_path = os.path.join(textract_viz_folder, filename)
|
| 5156 |
+
|
| 5157 |
+
# Save the combined image
|
| 5158 |
+
cv2.imwrite(output_path, combined_image)
|
| 5159 |
+
print(f"OCR visualization saved to: {output_path}")
|
| 5160 |
+
|
| 5161 |
+
log_files_output_paths.append(output_path)
|
| 5162 |
+
|
| 5163 |
+
return log_files_output_paths
|
| 5164 |
+
|
| 5165 |
+
|
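A usage sketch, assuming word-level results keyed by line (field names follow the loops above):

from PIL import Image

page = Image.open("page_1.png")
results = {
    "line_0": {
        "words": [
            {"text": "John", "bounding_box": (100, 50, 180, 80), "conf": 95, "model": "Tesseract"},
            {"text": "Smith", "bounding_box": (200, 50, 300, 80), "conf": 45, "model": "VLM"},
        ]
    }
}
log_paths = visualise_ocr_words_bounding_boxes(
    page, results, image_name="doc_page_1", output_folder=OUTPUT_FOLDER
)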
| 5166 |
+
def add_confidence_legend(
|
| 5167 |
+
image_cv: np.ndarray,
|
| 5168 |
+
confidence_ranges: List[Tuple],
|
| 5169 |
+
show_model_replacement: bool = False
|
| 5170 |
+
) -> None:
|
| 5171 |
+
"""
|
| 5172 |
+
Adds a confidence legend to the visualization image.
|
| 5173 |
+
|
| 5174 |
+
Args:
|
| 5175 |
+
image_cv: OpenCV image array
|
| 5176 |
+
confidence_ranges: List of tuples containing (min_conf, max_conf, color, label)
|
| 5177 |
+
show_model_replacement: Whether to include a legend entry for model replacements (grey)
|
| 5178 |
+
"""
|
| 5179 |
+
height, width = image_cv.shape[:2]
|
| 5180 |
+
|
| 5181 |
+
# Calculate legend height based on number of items
|
| 5182 |
+
num_items = len(confidence_ranges)
|
| 5183 |
+
if show_model_replacement:
|
| 5184 |
+
num_items += 1 # Add one more for model replacement entry
|
| 5185 |
+
|
| 5186 |
+
# Legend parameters
|
| 5187 |
+
legend_width = 200
|
| 5188 |
+
legend_height = 80 + (num_items * 25) # Dynamic height based on number of items
|
| 5189 |
+
legend_x = width - legend_width - 20
|
| 5190 |
+
legend_y = 20
|
| 5191 |
+
|
| 5192 |
+
# Draw legend background
|
| 5193 |
+
cv2.rectangle(
|
| 5194 |
+
image_cv,
|
| 5195 |
+
(legend_x, legend_y),
|
| 5196 |
+
(legend_x + legend_width, legend_y + legend_height),
|
| 5197 |
+
(255, 255, 255), # White background
|
| 5198 |
+
-1,
|
| 5199 |
+
)
|
| 5200 |
+
cv2.rectangle(
|
| 5201 |
+
image_cv,
|
| 5202 |
+
(legend_x, legend_y),
|
| 5203 |
+
(legend_x + legend_width, legend_y + legend_height),
|
| 5204 |
+
(0, 0, 0), # Black border
|
| 5205 |
+
2,
|
| 5206 |
+
)
|
| 5207 |
+
|
| 5208 |
+
# Add title
|
| 5209 |
+
title_text = "Confidence Levels"
|
| 5210 |
+
font_scale = 0.6
|
| 5211 |
+
font_thickness = 2
|
| 5212 |
+
(title_width, title_height), _ = cv2.getTextSize(
|
| 5213 |
+
title_text, cv2.FONT_HERSHEY_SIMPLEX, font_scale, font_thickness
|
| 5214 |
+
)
|
| 5215 |
+
title_x = legend_x + (legend_width - title_width) // 2
|
| 5216 |
+
title_y = legend_y + title_height + 10
|
| 5217 |
+
cv2.putText(
|
| 5218 |
+
image_cv,
|
| 5219 |
+
title_text,
|
| 5220 |
+
(title_x, title_y),
|
| 5221 |
+
cv2.FONT_HERSHEY_SIMPLEX,
|
| 5222 |
+
font_scale,
|
| 5223 |
+
(0, 0, 0), # Black text
|
| 5224 |
+
font_thickness,
|
| 5225 |
+
)
|
| 5226 |
+
|
| 5227 |
+
# Add confidence range items
|
| 5228 |
+
item_spacing = 25
|
| 5229 |
+
start_y = title_y + 25
|
| 5230 |
+
item_index = 0
|
| 5231 |
+
|
| 5232 |
+
# Add model replacement entry first if enabled
|
| 5233 |
+
if show_model_replacement:
|
| 5234 |
+
item_y = start_y + item_index * item_spacing
|
| 5235 |
+
item_index += 1
|
| 5236 |
+
|
| 5237 |
+
# Draw grey color box
|
| 5238 |
+
box_size = 15
|
| 5239 |
+
box_x = legend_x + 10
|
| 5240 |
+
box_y = item_y - box_size
|
| 5241 |
+
replacement_color = (128, 128, 128) # Grey in BGR
|
| 5242 |
+
cv2.rectangle(
|
| 5243 |
+
image_cv, (box_x, box_y), (box_x + box_size, box_y + box_size), replacement_color, -1
|
| 5244 |
+
)
|
| 5245 |
+
cv2.rectangle(
|
| 5246 |
+
image_cv,
|
| 5247 |
+
(box_x, box_y),
|
| 5248 |
+
(box_x + box_size, box_y + box_size),
|
| 5249 |
+
(0, 0, 0), # Black border
|
| 5250 |
+
1,
|
| 5251 |
+
)
|
| 5252 |
+
|
| 5253 |
+
# Add label text
|
| 5254 |
+
label_x = box_x + box_size + 10
|
| 5255 |
+
label_y = item_y - 5
|
| 5256 |
+
cv2.putText(
|
| 5257 |
+
image_cv,
|
| 5258 |
+
"Model Replacement",
|
| 5259 |
+
(label_x, label_y),
|
| 5260 |
+
cv2.FONT_HERSHEY_SIMPLEX,
|
| 5261 |
+
0.5,
|
| 5262 |
+
(0, 0, 0), # Black text
|
| 5263 |
+
1,
|
| 5264 |
+
)
|
| 5265 |
+
|
| 5266 |
+
# Add confidence range items
|
| 5267 |
+
for i, (min_conf, max_conf, color, label) in enumerate(confidence_ranges):
|
| 5268 |
+
item_y = start_y + (item_index + i) * item_spacing
|
| 5269 |
+
|
| 5270 |
+
# Draw color box
|
| 5271 |
+
box_size = 15
|
| 5272 |
+
box_x = legend_x + 10
|
| 5273 |
+
box_y = item_y - box_size
|
| 5274 |
+
cv2.rectangle(
|
| 5275 |
+
image_cv, (box_x, box_y), (box_x + box_size, box_y + box_size), color, -1
|
| 5276 |
+
)
|
| 5277 |
+
cv2.rectangle(
|
| 5278 |
+
image_cv,
|
| 5279 |
+
(box_x, box_y),
|
| 5280 |
+
(box_x + box_size, box_y + box_size),
|
| 5281 |
+
(0, 0, 0), # Black border
|
| 5282 |
+
1,
|
| 5283 |
+
)
|
| 5284 |
+
|
| 5285 |
+
# Add label text
|
| 5286 |
+
label_x = box_x + box_size + 10
|
| 5287 |
+
label_y = item_y - 5
|
| 5288 |
+
cv2.putText(
|
| 5289 |
+
image_cv,
|
| 5290 |
+
label,
|
| 5291 |
+
(label_x, label_y),
|
| 5292 |
+
cv2.FONT_HERSHEY_SIMPLEX,
|
| 5293 |
+
0.5,
|
| 5294 |
+
(0, 0, 0), # Black text
|
| 5295 |
+
1,
|
| 5296 |
+
)
|
| 5297 |
+
|
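A standalone sketch of the legend rendering on a blank canvas:

import cv2
import numpy as np

canvas = np.ones((400, 600, 3), dtype=np.uint8) * 255  # white page
ranges = [
    (80, 100, (0, 255, 0), "High (80-100%)"),
    (50, 79, (0, 165, 255), "Medium (50-79%)"),
    (0, 49, (0, 0, 255), "Low (0-49%)"),
]
add_confidence_legend(canvas, ranges, show_model_replacement=True)
cv2.imwrite("legend_demo.jpg", canvas)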
tools/run_vlm.py
ADDED
|
@@ -0,0 +1,211 @@
|
| 1 |
+
import os
|
| 2 |
+
import sys
|
| 3 |
+
import time
|
| 4 |
+
from threading import Thread
|
| 5 |
+
|
| 6 |
+
import spaces
|
| 7 |
+
from PIL import Image
|
| 8 |
+
|
| 9 |
+
from tools.config import SHOW_VLM_MODEL_OPTIONS, MAX_SPACES_GPU_RUN_TIME
|
| 10 |
+
|
| 11 |
+
if SHOW_VLM_MODEL_OPTIONS is True:
|
| 12 |
+
import torch
|
| 13 |
+
from huggingface_hub import snapshot_download
|
| 14 |
+
from transformers import (
|
| 15 |
+
AutoModelForCausalLM,
|
| 16 |
+
AutoProcessor,
|
| 17 |
+
Qwen2_5_VLForConditionalGeneration,
|
| 18 |
+
Qwen3VLForConditionalGeneration,
|
| 19 |
+
TextIteratorStreamer,
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
from tools.config import (
|
| 23 |
+
SELECTED_MODEL,
|
| 24 |
+
USE_FLASH_ATTENTION,
|
| 25 |
+
MODEL_CACHE_PATH,
|
| 26 |
+
)
|
| 27 |
+
|
| 28 |
+
# Configuration: Choose which vision model to load
|
| 29 |
+
# Options: "olmOCR-2-7B-1025", "Nanonets-OCR2-3B", "Chandra-OCR", "Dots.OCR"
|
| 30 |
+
# SELECTED_MODEL = os.getenv("VISION_MODEL", "Dots.OCR")
|
| 31 |
+
|
| 32 |
+
# This code uses significant amounts of code from the Hugging Face Space here: https://huggingface.co/spaces/prithivMLmods/Multimodal-OCR3 . Thanks!
|

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
    print("torch.__version__ =", torch.__version__)
    print("torch.version.cuda =", torch.version.cuda)
    print("cuda available:", torch.cuda.is_available())
    print("cuda device count:", torch.cuda.device_count())
    if torch.cuda.is_available():
        print("current device:", torch.cuda.current_device())
        print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

    print("Using device:", device)

    CACHE_PATH = MODEL_CACHE_PATH
    if not os.path.exists(CACHE_PATH):
        os.makedirs(CACHE_PATH)

    # Initialize model and processor variables
    processor = None
    model = None

    print(f"Loading vision model: {SELECTED_MODEL}")

    # Load only the selected model based on configuration
    if SELECTED_MODEL == "olmOCR-2-7B-1025":
        MODEL_ID = "allenai/olmOCR-2-7B-1025"
        processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
        model = (
            Qwen2_5_VLForConditionalGeneration.from_pretrained(
                MODEL_ID, trust_remote_code=True, torch_dtype=torch.float16
            )
            .to(device)
            .eval()
        )

    elif SELECTED_MODEL == "Nanonets-OCR2-3B":
        MODEL_ID = "nanonets/Nanonets-OCR2-3B"
        processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
        model = (
            Qwen2_5_VLForConditionalGeneration.from_pretrained(
                MODEL_ID, trust_remote_code=True, torch_dtype=torch.float16
            )
            .to(device)
            .eval()
        )

    elif SELECTED_MODEL == "Chandra-OCR":
        MODEL_ID = "datalab-to/chandra"
        processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
        model = (
            Qwen3VLForConditionalGeneration.from_pretrained(
                MODEL_ID, trust_remote_code=True, torch_dtype=torch.float16
            )
            .to(device)
            .eval()
        )

    elif SELECTED_MODEL == "Dots.OCR":
        # Download and patch Dots.OCR model
        model_path_d_local = snapshot_download(
            repo_id="rednote-hilab/dots.ocr",
            local_dir=os.path.join(CACHE_PATH, "dots.ocr"),
            max_workers=20,
            local_dir_use_symlinks=False,
        )
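
        # (Comment added for context) The snippet below patches the downloaded
        # configuration_dots.py: it inserts an `attributes` list on DotsVLProcessor
        # when one is missing, which transformers' ProcessorMixin appears to require
        # in order to resolve the image processor and tokenizer sub-components.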
        config_file_path = os.path.join(model_path_d_local, "configuration_dots.py")

        if os.path.exists(config_file_path):
            with open(config_file_path, "r") as f:
                input_code = f.read()

            lines = input_code.splitlines()
            if "class DotsVLProcessor" in input_code and not any(
                "attributes = " in line for line in lines
            ):
                output_lines = []
                for line in lines:
                    output_lines.append(line)
                    if line.strip().startswith("class DotsVLProcessor"):
                        output_lines.append(
                            '    attributes = ["image_processor", "tokenizer"]'
                        )

                with open(config_file_path, "w") as f:
                    f.write("\n".join(output_lines))
                print("Patched configuration_dots.py successfully.")

        sys.path.append(model_path_d_local)

        if USE_FLASH_ATTENTION is True:
            attn_implementation = "flash_attention_2"
        else:
            attn_implementation = "eager"
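        # (Comment added) "flash_attention_2" requires the separate flash-attn
        # package and a supported CUDA GPU; "eager" works everywhere and is the
        # safe fallback.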

        MODEL_ID = model_path_d_local
        processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            attn_implementation=attn_implementation,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True,
        ).eval()

    else:
        raise ValueError(
            f"Invalid model selected: {SELECTED_MODEL}. Valid options are: olmOCR-2-7B-1025, Nanonets-OCR2-3B, Chandra-OCR, Dots.OCR"
        )

    print(f"Successfully loaded {SELECTED_MODEL}")


@spaces.GPU(duration=MAX_SPACES_GPU_RUN_TIME)
def generate_image(
    text: str,
    image: Image.Image,
    max_new_tokens: int,
    temperature: float,
    top_p: float,
    top_k: int,
    repetition_penalty: float,
):
    """
    Generates responses using the configured vision model for image input.
    Streams text to console and returns complete text only at the end.
    """
    if image is None:
        return "Please upload an image."

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": text},
            ],
        }
    ]
    prompt_full = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
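    # (Comment added) {"type": "image"} is a placeholder in the chat template; the
    # actual PIL image is supplied to the processor call below, which pairs it
    # with the rendered prompt.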

    inputs = processor(
        text=[prompt_full], images=[image], return_tensors="pt", padding=True
    ).to(device)

    streamer = TextIteratorStreamer(
        processor, skip_prompt=True, skip_special_tokens=True
    )
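    # (Comment added) TextIteratorStreamer yields decoded text incrementally as
    # generate() runs; passing the processor works here because it delegates
    # decode() to its underlying tokenizer.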
    generation_kwargs = {
        **inputs,
        "streamer": streamer,
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "temperature": temperature,
        "top_p": top_p,
        "top_k": top_k,
        "repetition_penalty": repetition_penalty,
    }
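    # (Comment added) model.generate() blocks until generation finishes, so it
    # runs in a background thread while the main thread drains the streamer.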
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    buffer = ""
    for new_text in streamer:
        buffer += new_text
        buffer = buffer.replace("<|im_end|>", "")

        # Print to console as it streams
        print(new_text, end="", flush=True)

        time.sleep(0.01)

    # Print final newline after streaming is complete
    print()  # Add newline at the end

    # Return the complete text only at the end
    return buffer
|