# HunyuanOCR-ENGLISH
#
# Running
#
# File size: 22,106 Bytes

import os
import torch
import numpy as np
from PIL import Image
import spaces
from transformers import AutoProcessor
from qwen_vl_utils import process_vision_info
from transformers import HunYuanVLForConditionalGeneration
import gradio as gr
from argparse import ArgumentParser
import copy
import requests
from io import BytesIO
import tempfile
import hashlib
import gc

# Optimization: configure third-party libraries through environment variables.
# Both keys are written in one call; the values are the same strings as before.
os.environ.update({
    'TOKENIZERS_PARALLELISM': 'false',
    'TRANSFORMERS_NO_ADVISORY_WARNINGS': '1',
})
# Disabled TorchScript profiling tweaks, kept for reference:
# torch._C._jit_set_profiling_executor(False)
# torch._C._jit_set_profiling_mode(False)





def _get_args():
    """Parse the demo's command-line options.

    Returns:
        argparse.Namespace with ``checkpoint_path`` (str) and the boolean
        switches ``cpu_only``, ``flash_attn2``, ``share`` and ``inbrowser``
        (all ``False`` unless given on the command line).
    """
    parser = ArgumentParser()

    parser.add_argument(
        '-c', '--checkpoint-path',
        type=str,
        default='tencent/HunyuanOCR',
        help='Checkpoint name or path, default to %(default)r',
    )

    # Every remaining option is a plain on/off switch; register them in the
    # original order so the generated --help output is unchanged.
    switches = (
        ('--cpu-only', 'Run demo with CPU only'),
        ('--flash-attn2', 'Enable flash_attention_2 when loading the model.'),
        ('--share', 'Create a publicly shareable link for the interface.'),
        ('--inbrowser',
         'Automatically launch the interface in a new tab on the default browser.'),
    )
    for flag, description in switches:
        parser.add_argument(flag, action='store_true', default=False, help=description)

    return parser.parse_args()


def _load_model_processor(args):
    """Load the model and its processor from ``args.checkpoint_path``.

    The model is always loaded with eager attention, bfloat16 weights and
    ``device_map="auto"``; no other field of ``args`` is consulted here.

    Args:
        args: Namespace providing ``checkpoint_path``.

    Returns:
        Tuple ``(model, processor)`` with the model switched to eval mode.
    """
    checkpoint = args.checkpoint_path

    # ZeroGPU: Model loads on CPU, uses eager mode
    # Automatically moves to GPU within @spaces.GPU decorator
    print("[INFO] Loading model (ZeroGPU uses eager mode)")
    print(f"[INFO] CUDA available at load time: {torch.cuda.is_available()}")

    load_options = {
        "attn_implementation": "eager",  # Required for ZeroGPU (starts on CPU)
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",  # Let ZeroGPU manage device placement
    }
    model = HunYuanVLForConditionalGeneration.from_pretrained(checkpoint, **load_options)

    # Inference only: turn gradient checkpointing off when the model offers it.
    if hasattr(model, 'gradient_checkpointing_disable'):
        model.gradient_checkpointing_disable()
        print("[INFO] Gradient checkpointing disabled")

    model.eval()
    print("[INFO] Model set to eval mode")

    processor = AutoProcessor.from_pretrained(
        checkpoint,
        use_fast=False,
        trust_remote_code=True,
    )

    first_param = next(model.parameters())
    print(f"[INFO] Model loaded, device: {first_param.device}")
    return model, processor


def _parse_text(text):
    """Parse text, handle special formatting"""
    # if text is None:
    #     return text
    text = text.replace("<trans>", "").replace("</trans>", "")
    return text


def _remove_image_special(text):
    """Return ``text`` unchanged.

    Despite the name, this is currently a pass-through: the code that would
    strip image special tokens is commented out below, so callers get back
    exactly what they passed in (including ``None``).
    """
    # Disabled implementation, kept for reference:
    # if text is None:
    #     return text
    # # Remove image special tokens
    # import re
    # text = re.sub(r'<image>|</image>|<img>|</img>', '', text)
    # return text
    return text


def _gc():
    """Run Python garbage collection, then empty the CUDA cache if CUDA is available."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

def clean_repeated_substrings(text, min_text_length=2000, min_repeats=10):
    """Collapse a run of one substring repeated back-to-back at the end of ``text``.

    Candidate units are the last ``length`` characters of ``text`` for
    ``length = 2, 3, ...``. For each candidate the function counts how many
    consecutive copies end the text; the first (shortest) unit that occurs at
    least ``min_repeats`` times wins and all but one copy are dropped.

    Args:
        text: String to inspect.
        min_text_length: Texts shorter than this are returned untouched.
            Defaults to 2000, the previously hard-coded threshold.
        min_repeats: Minimum number of consecutive copies that counts as a
            degenerate repetition. Defaults to 10, the previously hard-coded
            threshold. Must be at least 2.

    Returns:
        ``text`` with the trailing repetition reduced to a single copy, or
        ``text`` itself when no qualifying repetition is found.

    Raises:
        ValueError: If ``min_repeats`` is smaller than 2.
    """
    if min_repeats < 2:
        raise ValueError("min_repeats must be at least 2")

    n = len(text)
    if n < min_text_length:
        return text

    # A unit repeated min_repeats times cannot be longer than n // min_repeats.
    for length in range(2, n // min_repeats + 1):
        candidate = text[-length:]
        count = 0
        i = n - length

        # Walk backwards in steps of one unit while each chunk matches.
        while i >= 0 and text[i:i + length] == candidate:
            count += 1
            i -= length

        if count >= min_repeats:
            # Keep exactly one copy of the repeated unit.
            return text[:n - length * (count - 1)]

    return text


def _launch_demo(args, model, processor):
    """Build the Gradio chat interface and start serving it.

    ``model`` and ``processor`` are captured by the nested handlers, so the
    same loaded instances serve every request.

    Args:
        args: Parsed command-line namespace; ``args.share`` and
            ``args.inbrowser`` are forwarded to ``launch()``.
        model: Loaded model exposing ``parameters()``, ``cuda()`` and
            ``generate()``.
        processor: Processor used for chat templating, input encoding and
            decoding of the generated ids.
    """
    # One-element list so the nested function can flip the flag in place.
    first_call = [True]

    # Duration increased to 120s to avoid timeout during peak hours
    @spaces.GPU(duration=120)
    def call_local_model(messages):
        """Generate a reply for one conversation and return the decoded texts.

        Args:
            messages: Chat messages of a single conversation, as assembled by
                ``predict`` (list of role/content dicts).

        Returns:
            List of decoded strings, one per batch entry; the first entry has
            been passed through ``clean_repeated_substrings``.

        Raises:
            Exception: Anything raised by ``model.generate`` is logged with a
                traceback and re-raised.
        """
        import time
        import sys
        import traceback
        start_time = time.time()

        if first_call[0]:
            print(f"[INFO] ========== First inference call ==========")
            first_call[0] = False
        else:
            print(f"[INFO] ========== Subsequent inference call ==========")

        print(f"[DEBUG] ========== Starting inference ==========")
        print(f"[DEBUG] Python version: {sys.version}")
        print(f"[DEBUG] PyTorch version: {torch.__version__}")
        print(f"[DEBUG] CUDA available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            print(f"[DEBUG] CUDA device count: {torch.cuda.device_count()}")
            print(f"[DEBUG] Current CUDA device: {torch.cuda.current_device()}")
            print(f"[DEBUG] Device name: {torch.cuda.get_device_name(0)}")
            print(f"[DEBUG] GPU Memory allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
            print(f"[DEBUG] GPU Memory reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")

        # Ensure model is on GPU
        model_device = next(model.parameters()).device
        print(f"[DEBUG] Model device: {model_device}")
        print(f"[DEBUG] Model dtype: {next(model.parameters()).dtype}")

        if str(model_device) == 'cpu':
            print(f"[ERROR] Model on CPU! Attempting to move to GPU...")
            if torch.cuda.is_available():
                move_start = time.time()
                model.cuda()
                move_time = time.time() - move_start
                print(f"[DEBUG] Model device after cuda(): {next(model.parameters()).device}")
                print(f"[DEBUG] Model moved to GPU in: {move_time:.2f}s")
            else:
                print(f"[CRITICAL] CUDA unavailable! Running on CPU will be slow!")
                print(f"[CRITICAL] This may be due to ZeroGPU resource constraints")
        else:
            print(f"[INFO] Model already on GPU: {model_device}")

        # The processor helpers below work on a batch; wrap the single
        # conversation into a batch of one.
        messages = [messages]

        # Build input using processor
        texts = [
            processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
            for msg in messages
        ]
        print(f"[DEBUG] Template built, elapsed: {time.time() - start_time:.2f}s")

        image_inputs, video_inputs = process_vision_info(messages)
        print(f"[DEBUG] Image processing done, elapsed: {time.time() - start_time:.2f}s")

        # Check image input size
        if image_inputs:
            for idx, img in enumerate(image_inputs):
                if hasattr(img, 'size'):
                    print(f"[DEBUG] Image {idx} size: {img.size}")
                elif isinstance(img, np.ndarray):
                    print(f"[DEBUG] Image {idx} shape: {img.shape}")

        print(f"[DEBUG] Starting processor encoding...")
        processor_start = time.time()
        inputs = processor(
            text=texts,
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        print(f"[DEBUG] Processor encoding done, elapsed: {time.time() - processor_start:.2f}s")

        # Ensure inputs on GPU
        to_device_start = time.time()
        inputs = inputs.to('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"[DEBUG] Inputs moved to device, elapsed: {time.time() - to_device_start:.2f}s")
        print(f"[DEBUG] Input preparation done, total elapsed: {time.time() - start_time:.2f}s")
        print(f"[DEBUG] Input IDs shape: {inputs.input_ids.shape}")
        print(f"[DEBUG] Input device: {inputs.input_ids.device}")
        print(f"[DEBUG] Input sequence length: {inputs.input_ids.shape[1]}")

        # Generation
        gen_start = time.time()
        print(f"[DEBUG] ========== Starting token generation ==========")

        # Optimized max_new_tokens for OCR tasks
        max_new_tokens = 2048
        print(f"[DEBUG] max_new_tokens: {max_new_tokens}")

        with torch.no_grad():
            print(f"[DEBUG] Entered torch.no_grad() context, elapsed: {time.time() - start_time:.2f}s")

            # No separate "test" forward pass here: generate() runs the prompt
            # through the model itself, so a discarded extra pass only spends
            # time inside the limited GPU window.
            print(f"[DEBUG] Starting model.generate()... (elapsed: {time.time() - start_time:.2f}s)")
            generate_call_start = time.time()

            try:
                # Deterministic (greedy) generation. No temperature is passed:
                # it has no effect when do_sample=False.
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=False,
                )
                print(f"[DEBUG] model.generate() returned, elapsed: {time.time() - generate_call_start:.2f}s")
            except Exception as e:
                print(f"[ERROR] Generation failed: {e}")
                traceback.print_exc()
                raise

        print(f"[DEBUG] Exited torch.no_grad() context")

        gen_time = time.time() - gen_start
        print(f"[DEBUG] ========== Generation complete ==========")
        print(f"[DEBUG] Generation time: {gen_time:.2f}s")
        print(f"[DEBUG] Output shape: {generated_ids.shape}")

        # Decode output
        if "input_ids" in inputs:
            input_ids = inputs.input_ids
        else:
            input_ids = inputs.inputs

        # Drop the prompt tokens so only newly generated ids are decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(input_ids, generated_ids)
        ]

        actual_tokens = len(generated_ids_trimmed[0])
        print(f"[DEBUG] Actual tokens generated: {actual_tokens}")
        print(f"[DEBUG] Time per token: {gen_time/actual_tokens if actual_tokens > 0 else 0:.3f}s")

        output_texts = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )

        total_time = time.time() - start_time
        print(f"[DEBUG] ========== All done ==========")
        print(f"[DEBUG] Total time: {total_time:.2f}s")
        print(f"[DEBUG] Output length: {len(output_texts[0])} chars")
        print(f"[DEBUG] Output preview: {output_texts[0][:100]}...")
        output_texts[0] = clean_repeated_substrings(output_texts[0])
        return output_texts

    def predict(_chatbot, task_history):
        """Answer the newest turn and yield the updated chatbot history.

        This is a generator, so every exit path must ``yield`` the value the
        UI should display; a bare ``return value`` would be dropped.

        Args:
            _chatbot: Display history; the last entry is the pending turn.
            task_history: Model-side history; entries are ``(query, answer)``
                where ``query`` is a string or a one-element tuple holding an
                image path. Mutated in place.

        Yields:
            The updated ``_chatbot`` list.
        """
        chat_query = _chatbot[-1][0]
        query = task_history[-1][0]
        if len(chat_query) == 0:
            # Empty submission: discard the placeholder turn and push the
            # shortened history to the UI.
            _chatbot.pop()
            task_history.pop()
            yield _chatbot
            return
        print('User: ', query)
        history_cp = copy.deepcopy(task_history)
        messages = []
        content = []
        for q, a in history_cp:
            if isinstance(q, (tuple, list)):
                # Image entry: collect it until the next text entry closes
                # the user message.
                img_path = q[0]
                if img_path.startswith(('http://', 'https://')):
                    content.append({'type': 'image', 'image': img_path})
                else:
                    content.append({'type': 'image', 'image': f'{os.path.abspath(img_path)}'})
            else:
                content.append({'type': 'text', 'text': q})
                messages.append({'role': 'user', 'content': content})
                messages.append({'role': 'assistant', 'content': [{'type': 'text', 'text': a}]})
                content = []
        # The final assistant slot belongs to the turn being answered now.
        messages.pop()

        # Call model to get response
        response_list = call_local_model(messages)
        response = response_list[0] if response_list else ""

        full_response = _parse_text(response)
        _chatbot[-1] = (_parse_text(chat_query), _remove_image_special(full_response))

        task_history[-1] = (query, full_response)
        print('HunyuanOCR: ' + _parse_text(full_response))
        yield _chatbot

    def regenerate(_chatbot, task_history):
        """Discard the last answer and generate it again.

        Args:
            _chatbot: Display history.
            task_history: Model-side history, mutated in place.

        Yields:
            The chatbot history: unchanged when there is nothing to
            regenerate, otherwise whatever ``predict`` yields.
        """
        if not task_history:
            yield _chatbot
            return
        item = task_history[-1]
        if item[1] is None:
            yield _chatbot
            return
        task_history[-1] = (item[0], None)
        chatbot_item = _chatbot.pop(-1)
        if chatbot_item[0] is None:
            _chatbot[-1] = (_chatbot[-1][0], None)
        else:
            _chatbot.append((chatbot_item[0], None))
        yield from predict(_chatbot, task_history)

    def add_text(history, task_history, text):
        """Append a pending text turn to both histories and clear the textbox value."""
        task_text = text
        history = history if history is not None else []
        task_history = task_history if task_history is not None else []
        history = history + [(_parse_text(text), None)]
        task_history = task_history + [(task_text, None)]
        return history, task_history, ''

    def add_file(history, task_history, file):
        """Append an uploaded file (by its ``name`` path) to both histories."""
        history = history if history is not None else []
        task_history = task_history if task_history is not None else []
        history = history + [((file.name,), None)]
        task_history = task_history + [((file.name,), None)]
        return history, task_history

    def download_url_image(url):
        """Download URL image to local temp file.

        Not bound to any UI event in this function.

        Returns:
            Path of the cached file, or ``url`` itself when the download fails.
        """
        try:
            # Use URL hash as filename to avoid duplicate downloads
            url_hash = hashlib.md5(url.encode()).hexdigest()
            temp_dir = tempfile.gettempdir()
            temp_path = os.path.join(temp_dir, f"hyocr_demo_{url_hash}.png")

            # Return cached file if exists
            if os.path.exists(temp_path):
                return temp_path

            # Download image
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            with open(temp_path, 'wb') as f:
                f.write(response.content)
            return temp_path
        except Exception as e:
            print(f"Failed to download image: {url}, error: {e}")
            return url  # Return original URL on failure

    def reset_user_input():
        """Clear the query textbox."""
        return gr.update(value='')

    def reset_state(_chatbot, task_history):
        """Empty both histories, free memory and return an empty chat."""
        task_history.clear()
        _chatbot.clear()
        _gc()
        return []

    # Example image paths - local files
    EXAMPLE_IMAGES = {
        "spotting": "examples/spotting.jpg",
        "parsing": "examples/parsing.jpg",
        "ie": "examples/ie.jpg",
        "vqa": "examples/vqa.jpg",
        "translation": "examples/translation.jpg"
    }

    # (image key, prompt) for the five example buttons, in button order.
    EXAMPLE_SPECS = (
        # Example 1: Text Detection
        ("spotting",
         "Detect and recognize all text in this image. Output the text with bounding box coordinates."),
        # Example 2: Document Parsing
        ("parsing",
         "Extract all text from this document in markdown format. Use HTML for tables and LaTeX for equations. Parse in reading order."),
        # Example 3: Information Extraction
        ("ie",
         "Extract the following fields from this receipt and return as JSON: ['total', 'subtotal', 'tax', 'date', 'items']"),
        # Example 4: Visual Q&A
        ("vqa",
         "Look at this chart and answer: Which quarter had the highest revenue? What was the Sales value in Q4?"),
        # Example 5: Translation
        ("translation",
         "Translate all text in this image to English."),
    )

    def make_example_loader(index, image_key, prompt):
        """Create the click handler for example button number ``index``."""

        def load_example(history, task_hist):
            # Loading an example starts a fresh conversation holding only the
            # example image; the prompt goes into the textbox.
            image_path = EXAMPLE_IMAGES[image_key]
            history = [((image_path,), None)]
            task_hist = [((image_path,), None)]
            return history, task_hist, prompt

        # Keep the per-example function names of the former hand-written
        # handlers so names derived from them stay the same.
        load_example.__name__ = f"load_example_{index}"
        return load_example

    with gr.Blocks() as demo:
        # Header
        gr.Markdown("# HunyuanOCR\n*Powered by Tencent Hunyuan Team*")

        with gr.Column():
            # Chat area
            chatbot = gr.Chatbot(
                label='Chat',
                height=600,
                bubble_full_width=False,
                layout="bubble",
                show_copy_button=True,
            )

            # Input panel
            with gr.Group():
                query = gr.Textbox(
                    lines=2,
                    label='Enter your question',
                    placeholder='Upload an image first, then enter your question. Example: Detect and recognize text in this image.',
                    show_label=False
                )

                with gr.Row():
                    addfile_btn = gr.UploadButton('Upload Image', file_types=['image'])
                    submit_btn = gr.Button('Send', variant="primary", scale=3)
                    regen_btn = gr.Button('Regenerate')
                    empty_bin = gr.Button('Clear')

            # Examples section
            gr.Markdown("### Quick Examples - Click to load")

            with gr.Row():
                example_1_btn = gr.Button("Text Detection")
                example_2_btn = gr.Button("Document Parsing")
                example_3_btn = gr.Button("Info Extraction")
                example_4_btn = gr.Button("Visual Q&A")
                example_5_btn = gr.Button("Translation")

        task_history = gr.State([])

        # Bind events
        example_buttons = (example_1_btn, example_2_btn, example_3_btn, example_4_btn, example_5_btn)
        for index, (button, (image_key, prompt)) in enumerate(zip(example_buttons, EXAMPLE_SPECS), start=1):
            button.click(
                make_example_loader(index, image_key, prompt),
                [chatbot, task_history],
                [chatbot, task_history, query],
            )

        submit_btn.click(add_text, [chatbot, task_history, query],
                         [chatbot, task_history]).then(predict, [chatbot, task_history], [chatbot], show_progress=True)
        submit_btn.click(reset_user_input, [], [query])
        empty_bin.click(reset_state, [chatbot, task_history], [chatbot], show_progress=True)
        regen_btn.click(regenerate, [chatbot, task_history], [chatbot], show_progress=True)
        addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn], [chatbot, task_history], show_progress=True)

        # Feature descriptions
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("""
### Core Features
- **Text Detection & Recognition** - Multi-scene text detection and recognition
- **Document Parsing** - Automatic document structure recognition
- **Information Extraction** - Extract structured data from receipts and forms
- **Visual Q&A** - Text-centric open-ended question answering
- **Translation** - Translate text in images across 14+ languages
                """)

            with gr.Column(scale=1):
                gr.Markdown("""
### Usage Tips
- **Inference** - For production, use VLLM for better performance
- **Image Quality** - Ensure images are clear, well-lit, and not heavily skewed
- **File Size** - Recommended max 10MB per image, JPG/PNG format
- **Use Cases** - OCR, document digitization, receipt recognition, translation
                """)

        # Footer
        gr.Markdown("---\n*2025 Tencent Hunyuan Team. For research and educational use.*")

    demo.queue().launch(
        share=args.share,
        inbrowser=args.inbrowser,
        # server_port=args.server_port,
        # server_name=args.server_name,
    )


def main():
    """Entry point: parse CLI arguments, load model and processor, launch the demo."""
    cli_args = _get_args()
    loaded = _load_model_processor(cli_args)
    # ``loaded`` is the (model, processor) pair, forwarded positionally.
    _launch_demo(cli_args, *loaded)


# Start the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()