"""Gradio chat demo for the HunyuanOCR vision-language model.

The script loads the checkpoint once at start-up, builds a Gradio Blocks UI
(chat window, image upload, five canned example prompts) and serves it.
Inference runs inside a ``@spaces.GPU`` decorated function.
"""
import copy
import gc
import hashlib
import os
import sys
import tempfile
import time
import traceback
from argparse import ArgumentParser
from io import BytesIO

# Third-party imports are kept in their original relative order.
import torch
import numpy as np
from PIL import Image
import spaces
from transformers import AutoProcessor
from qwen_vl_utils import process_vision_info
from transformers import HunYuanVLForConditionalGeneration
import gradio as gr
import requests

# Optimization: Set environment variables
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'

# Upper bound on generated tokens per request.
MAX_NEW_TOKENS = 2048

# (button label, example image path, prompt placed into the textbox)
_EXAMPLES = (
    (
        "Text Detection",
        "examples/spotting.jpg",
        "Detect and recognize all text in this image. "
        "Output the text with bounding box coordinates.",
    ),
    (
        "Document Parsing",
        "examples/parsing.jpg",
        "Extract all text from this document in markdown format. "
        "Use HTML for tables and LaTeX for equations. Parse in reading order.",
    ),
    (
        "Info Extraction",
        "examples/ie.jpg",
        "Extract the following fields from this receipt and return as JSON: "
        "['total', 'subtotal', 'tax', 'date', 'items']",
    ),
    (
        "Visual Q&A",
        "examples/vqa.jpg",
        "Look at this chart and answer: Which quarter had the highest revenue? "
        "What was the Sales value in Q4?",
    ),
    (
        "Translation",
        "examples/translation.jpg",
        "Translate all text in this image to English.",
    ),
)

# Markdown is assembled line by line so rendering does not depend on how an
# indented triple-quoted literal would be dedented.
_FEATURES_MD = "\n".join((
    "### Core Features",
    "- **Text Detection & Recognition** - Multi-scene text detection and recognition",
    "- **Document Parsing** - Automatic document structure recognition",
    "- **Information Extraction** - Extract structured data from receipts and forms",
    "- **Visual Q&A** - Text-centric open-ended question answering",
    "- **Translation** - Translate text in images across 14+ languages",
))

_TIPS_MD = "\n".join((
    "### Usage Tips",
    "- **Inference** - For production, use VLLM for better performance",
    "- **Image Quality** - Ensure images are clear, well-lit, and not heavily skewed",
    "- **File Size** - Recommended max 10MB per image, JPG/PNG format",
    "- **Use Cases** - OCR, document digitization, receipt recognition, translation",
))


def _get_args():
    """Parse and return the command-line arguments of the demo."""
    parser = ArgumentParser()
    parser.add_argument('-c', '--checkpoint-path', type=str,
                        default='tencent/HunyuanOCR',
                        help='Checkpoint name or path, default to %(default)r')
    parser.add_argument('--cpu-only', action='store_true',
                        help='Run demo with CPU only')
    parser.add_argument('--flash-attn2', action='store_true', default=False,
                        help='Enable flash_attention_2 when loading the model.')
    parser.add_argument('--share', action='store_true', default=False,
                        help='Create a publicly shareable link for the interface.')
    parser.add_argument('--inbrowser', action='store_true', default=False,
                        help='Automatically launch the interface in a new tab '
                             'on the default browser.')
    return parser.parse_args()


def _load_model_processor(args):
    """Load the model and its processor from ``args.checkpoint_path``.

    ``--cpu-only`` and ``--flash-attn2`` are honoured here. With neither flag
    set the model is loaded exactly as before: eager attention and
    ``device_map="auto"``.

    Returns:
        A ``(model, processor)`` tuple; the model is in eval mode.
    """
    cpu_only = getattr(args, 'cpu_only', False)
    # flash-attention needs a GPU, so --cpu-only takes precedence.
    use_flash_attn2 = getattr(args, 'flash_attn2', False) and not cpu_only
    attn_implementation = 'flash_attention_2' if use_flash_attn2 else 'eager'
    device_map = 'cpu' if cpu_only else 'auto'

    print(f"[INFO] Loading model (attn_implementation={attn_implementation}, "
          f"device_map={device_map})")
    print(f"[INFO] CUDA available at load time: {torch.cuda.is_available()}")

    model = HunYuanVLForConditionalGeneration.from_pretrained(
        args.checkpoint_path,
        attn_implementation=attn_implementation,
        torch_dtype=torch.bfloat16,
        device_map=device_map,
    )

    # Disable gradient checkpointing for faster inference
    if hasattr(model, 'gradient_checkpointing_disable'):
        model.gradient_checkpointing_disable()
        print("[INFO] Gradient checkpointing disabled")

    model.eval()
    print("[INFO] Model set to eval mode")

    processor = AutoProcessor.from_pretrained(
        args.checkpoint_path, use_fast=False, trust_remote_code=True)
    print(f"[INFO] Model loaded, device: {next(model.parameters()).device}")
    return model, processor


def _parse_text(text):
    """Return ``text`` prepared for display.

    NOTE(review): both ``replace`` calls below have an empty search string and
    are therefore no-ops as written; presumably they once stripped special
    tokens - confirm against the upstream file.
    """
    text = text.replace("", "").replace("", "")
    return text


def _remove_image_special(text):
    """Return ``text`` unchanged (placeholder for image-token stripping)."""
    return text


def _gc():
    """Run the garbage collector and, when CUDA is available, empty its cache."""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()


def clean_repeated_substrings(text):
    """Trim a degenerate repeated tail from a long model output.

    Texts shorter than 2000 characters are returned untouched. Otherwise, for
    each candidate length from 2 up to a tenth of the text, the trailing
    substring of that length is compared against the chunks immediately
    preceding it; if it occurs at least 10 times back-to-back at the end of
    the text, all but one copy are removed.
    """
    n = len(text)
    if n < 2000:
        return text
    for length in range(2, n // 10 + 1):
        candidate = text[-length:]
        count = 0
        i = n - length
        # Walk backwards one chunk at a time while the chunk repeats.
        while i >= 0 and text[i:i + length] == candidate:
            count += 1
            i -= length
        if count >= 10:
            return text[:n - length * (count - 1)]
    return text


def download_url_image(url):
    """Download ``url`` into the temp directory and return the local path.

    The file name is derived from the MD5 of the URL so that repeated requests
    reuse the already downloaded file. On a network or file-system error the
    original URL is returned instead.

    Note: nothing in this script currently calls this helper.
    """
    url_hash = hashlib.md5(url.encode()).hexdigest()
    temp_path = os.path.join(tempfile.gettempdir(), f"hyocr_demo_{url_hash}.png")
    # Return cached file if exists
    if os.path.exists(temp_path):
        return temp_path
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        with open(temp_path, 'wb') as f:
            f.write(response.content)
    except (requests.RequestException, OSError) as e:
        print(f"Failed to download image: {url}, error: {e}")
        return url  # Return original URL on failure
    return temp_path


def _log_runtime_info():
    """Print interpreter, PyTorch and CUDA diagnostics."""
    print("[DEBUG] ========== Starting inference ==========")
    print(f"[DEBUG] Python version: {sys.version}")
    print(f"[DEBUG] PyTorch version: {torch.__version__}")
    print(f"[DEBUG] CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"[DEBUG] CUDA device count: {torch.cuda.device_count()}")
        print(f"[DEBUG] Current CUDA device: {torch.cuda.current_device()}")
        print(f"[DEBUG] Device name: {torch.cuda.get_device_name(0)}")
        print(f"[DEBUG] GPU Memory allocated: "
              f"{torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
        print(f"[DEBUG] GPU Memory reserved: "
              f"{torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")


def _ensure_model_on_gpu(model, cpu_only=False):
    """Move ``model`` to CUDA if it sits on the CPU and a GPU may be used."""
    model_device = next(model.parameters()).device
    print(f"[DEBUG] Model device: {model_device}")
    print(f"[DEBUG] Model dtype: {next(model.parameters()).dtype}")

    if str(model_device) != 'cpu':
        print(f"[INFO] Model already on GPU: {model_device}")
        return
    if cpu_only:
        print("[INFO] --cpu-only set, running inference on CPU")
        return

    print("[ERROR] Model on CPU! Attempting to move to GPU...")
    if not torch.cuda.is_available():
        print("[CRITICAL] CUDA unavailable! Running on CPU will be slow!")
        print("[CRITICAL] This may be due to ZeroGPU resource constraints")
        return
    move_start = time.time()
    model.cuda()
    move_time = time.time() - move_start
    print(f"[DEBUG] Model device after cuda(): {next(model.parameters()).device}")
    print(f"[DEBUG] Model moved to GPU in: {move_time:.2f}s")


def _build_messages(task_history):
    """Convert the ``(query, answer)`` history into chat-template messages.

    Tuple/list queries are image entries (first element is a URL or path);
    they are accumulated and attached to the next text query as one user
    turn. Every text query is followed by its assistant answer, and the final
    assistant entry (the answer still to be generated) is dropped.
    """
    messages = []
    content = []
    for q, a in task_history:
        if isinstance(q, (tuple, list)):
            img_path = q[0]
            # Local files are passed as absolute paths, URLs untouched.
            if not img_path.startswith(('http://', 'https://')):
                img_path = os.path.abspath(img_path)
            content.append({'type': 'image', 'image': img_path})
        else:
            content.append({'type': 'text', 'text': q})
            messages.append({'role': 'user', 'content': content})
            messages.append(
                {'role': 'assistant', 'content': [{'type': 'text', 'text': a}]})
            content = []
    if messages:
        messages.pop()
    return messages


def _make_example_loader(image_path, prompt):
    """Return a click handler that resets the chat to one example image.

    The handler ignores the incoming histories, returns fresh chatbot and task
    histories containing only the image, and returns ``prompt`` for the
    textbox.
    """
    def load_example(history, task_hist):
        entry = ((image_path,), None)
        return [entry], [entry], prompt

    return load_example


def _launch_demo(args, model, processor):
    """Build the Gradio UI around ``model``/``processor`` and launch it."""
    cpu_only = getattr(args, 'cpu_only', False)

    # Track first call
    first_call = [True]

    # Uses closure to access model and processor
    # Duration increased to 120s to avoid timeout during peak hours
    @spaces.GPU(duration=120)
    def call_local_model(messages):
        """Run greedy generation for one conversation; return decoded texts."""
        start_time = time.time()

        if first_call[0]:
            print("[INFO] ========== First inference call ==========")
            first_call[0] = False
        else:
            print("[INFO] ========== Subsequent inference call ==========")

        _log_runtime_info()
        _ensure_model_on_gpu(model, cpu_only=cpu_only)

        batch = [messages]

        # Build input using processor
        texts = [
            processor.apply_chat_template(
                msg, tokenize=False, add_generation_prompt=True)
            for msg in batch
        ]
        print(f"[DEBUG] Template built, elapsed: {time.time() - start_time:.2f}s")

        image_inputs, video_inputs = process_vision_info(batch)
        print(f"[DEBUG] Image processing done, elapsed: {time.time() - start_time:.2f}s")

        # Check image input size
        for idx, img in enumerate(image_inputs or ()):
            if hasattr(img, 'size'):
                print(f"[DEBUG] Image {idx} size: {img.size}")
            elif isinstance(img, np.ndarray):
                print(f"[DEBUG] Image {idx} shape: {img.shape}")

        print("[DEBUG] Starting processor encoding...")
        processor_start = time.time()
        inputs = processor(
            text=texts,
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        print(f"[DEBUG] Processor encoding done, elapsed: {time.time() - processor_start:.2f}s")

        to_device_start = time.time()
        use_cuda = torch.cuda.is_available() and not cpu_only
        inputs = inputs.to('cuda' if use_cuda else 'cpu')
        print(f"[DEBUG] Inputs moved to device, elapsed: {time.time() - to_device_start:.2f}s")
        print(f"[DEBUG] Input preparation done, total elapsed: {time.time() - start_time:.2f}s")
        print(f"[DEBUG] Input IDs shape: {inputs.input_ids.shape}")
        print(f"[DEBUG] Input device: {inputs.input_ids.device}")
        print(f"[DEBUG] Input sequence length: {inputs.input_ids.shape[1]}")

        # Generation
        gen_start = time.time()
        print("[DEBUG] ========== Starting token generation ==========")
        print(f"[DEBUG] max_new_tokens: {MAX_NEW_TOKENS}")

        with torch.no_grad():
            print(f"[DEBUG] Starting model.generate()... (elapsed: {time.time() - start_time:.2f}s)")
            generate_call_start = time.time()
            try:
                # Greedy decoding. No sampling flags are passed because they
                # are unused when do_sample=False.
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=MAX_NEW_TOKENS,
                    do_sample=False,
                )
            except Exception as e:
                # Log with a traceback, then let the error propagate.
                print(f"[ERROR] Generation failed: {e}")
                traceback.print_exc()
                raise
            print(f"[DEBUG] model.generate() returned, elapsed: {time.time() - generate_call_start:.2f}s")

        gen_time = time.time() - gen_start
        print("[DEBUG] ========== Generation complete ==========")
        print(f"[DEBUG] Generation time: {gen_time:.2f}s")
        print(f"[DEBUG] Output shape: {generated_ids.shape}")

        # Strip the prompt tokens so only the newly generated part is decoded.
        generated_ids_trimmed = [
            out_ids[len(in_ids):]
            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
        ]
        actual_tokens = len(generated_ids_trimmed[0])
        time_per_token = gen_time / actual_tokens if actual_tokens > 0 else 0
        print(f"[DEBUG] Actual tokens generated: {actual_tokens}")
        print(f"[DEBUG] Time per token: {time_per_token:.3f}s")

        output_texts = processor.batch_decode(
            generated_ids_trimmed,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )

        total_time = time.time() - start_time
        print("[DEBUG] ========== All done ==========")
        print(f"[DEBUG] Total time: {total_time:.2f}s")
        print(f"[DEBUG] Output length: {len(output_texts[0])} chars")
        print(f"[DEBUG] Output preview: {output_texts[0][:100]}...")

        output_texts[0] = clean_repeated_substrings(output_texts[0])
        return output_texts

    def predict(_chatbot, task_history):
        """Generate the answer for the last user turn and yield the chatbot.

        This is a generator, so every exit path yields the chatbot value; a
        bare ``return value`` would never reach the UI. ``task_history`` is
        updated in place.
        """
        chat_query = _chatbot[-1][0]
        query = task_history[-1][0]
        if not chat_query:
            # Empty submission: drop the placeholder turn and refresh the UI.
            _chatbot.pop()
            task_history.pop()
            yield _chatbot
            return

        print('User: ', query)
        messages = _build_messages(task_history)
        if not messages:
            yield _chatbot
            return

        # Call model to get response
        response_list = call_local_model(messages)
        response = response_list[0] if response_list else ""
        full_response = _parse_text(response)

        _chatbot[-1] = (_parse_text(chat_query),
                        _remove_image_special(full_response))
        task_history[-1] = (query, full_response)
        print('HunyuanOCR: ' + _parse_text(full_response))
        yield _chatbot

    def regenerate(_chatbot, task_history):
        """Discard the last answer and generate it again via ``predict``."""
        if not task_history or task_history[-1][1] is None:
            # Nothing to regenerate; still yield so the UI gets a value.
            yield _chatbot
            return

        task_history[-1] = (task_history[-1][0], None)
        chatbot_item = _chatbot.pop(-1)
        if chatbot_item[0] is None:
            _chatbot[-1] = (_chatbot[-1][0], None)
        else:
            _chatbot.append((chatbot_item[0], None))
        yield from predict(_chatbot, task_history)

    def add_text(history, task_history, text):
        """Append a text turn to both histories and clear the textbox.

        Returns ``(history, task_history, '')``; the empty string is bound to
        the query textbox.
        """
        history = history if history is not None else []
        task_history = task_history if task_history is not None else []
        history = history + [(_parse_text(text), None)]
        task_history = task_history + [(text, None)]
        return history, task_history, ''

    def add_file(history, task_history, file):
        """Append an uploaded image (as a 1-tuple of its path) to both histories."""
        history = history if history is not None else []
        task_history = task_history if task_history is not None else []
        history = history + [((file.name,), None)]
        task_history = task_history + [((file.name,), None)]
        return history, task_history

    def reset_state(_chatbot, task_history):
        """Clear both histories in place, free memory, and return an empty chat."""
        task_history.clear()
        _chatbot.clear()
        _gc()
        return []

    with gr.Blocks() as demo:
        # Header
        gr.Markdown("# HunyuanOCR\n*Powered by Tencent Hunyuan Team*")

        with gr.Column():
            # Chat area
            chatbot = gr.Chatbot(
                label='Chat',
                height=600,
                bubble_full_width=False,
                layout="bubble",
                show_copy_button=True,
            )

            # Input panel
            with gr.Group():
                query = gr.Textbox(
                    lines=2,
                    label='Enter your question',
                    placeholder='Upload an image first, then enter your question. '
                                'Example: Detect and recognize text in this image.',
                    show_label=False,
                )
                with gr.Row():
                    addfile_btn = gr.UploadButton('Upload Image', file_types=['image'])
                    submit_btn = gr.Button('Send', variant="primary", scale=3)
                    regen_btn = gr.Button('Regenerate')
                    empty_bin = gr.Button('Clear')

            # Examples section
            gr.Markdown("### Quick Examples - Click to load")
            with gr.Row():
                example_btns = [gr.Button(label) for label, _, _ in _EXAMPLES]

        task_history = gr.State([])

        # Bind events
        for btn, (_, image_path, prompt) in zip(example_btns, _EXAMPLES):
            btn.click(
                _make_example_loader(image_path, prompt),
                [chatbot, task_history],
                [chatbot, task_history, query],
            )

        # add_text returns three values; the third clears the textbox.
        submit_btn.click(
            add_text,
            [chatbot, task_history, query],
            [chatbot, task_history, query],
        ).then(predict, [chatbot, task_history], [chatbot], show_progress=True)
        empty_bin.click(reset_state, [chatbot, task_history], [chatbot],
                        show_progress=True)
        regen_btn.click(regenerate, [chatbot, task_history], [chatbot],
                        show_progress=True)
        addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn],
                           [chatbot, task_history], show_progress=True)

        # Feature descriptions
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown(_FEATURES_MD)
            with gr.Column(scale=1):
                gr.Markdown(_TIPS_MD)

        # Footer
        gr.Markdown("---\n*2025 Tencent Hunyuan Team. For research and educational use.*")

    demo.queue().launch(
        share=args.share,
        inbrowser=args.inbrowser,
    )


def main():
    """Parse arguments, load the model and launch the demo."""
    args = _get_args()
    model, processor = _load_model_processor(args)
    _launch_demo(args, model, processor)


if __name__ == '__main__':
    main()