Spaces:
Running
Running
| import os | |
| import torch | |
| import numpy as np | |
| from PIL import Image | |
| import spaces | |
| from transformers import AutoProcessor | |
| from qwen_vl_utils import process_vision_info | |
| from transformers import HunYuanVLForConditionalGeneration | |
| import gradio as gr | |
| from argparse import ArgumentParser | |
| import copy | |
| import requests | |
| from io import BytesIO | |
| import tempfile | |
| import hashlib | |
| import gc | |
# Optimization: environment switches read by the tokenizers / transformers
# libraries, applied in a single update call.
os.environ.update({
    'TOKENIZERS_PARALLELISM': 'false',
    'TRANSFORMERS_NO_ADVISORY_WARNINGS': '1',
})
# The torch JIT profiling toggles below are intentionally left disabled.
# torch._C._jit_set_profiling_executor(False)
# torch._C._jit_set_profiling_mode(False)
def _get_args():
    """Build the demo's command-line parser and parse ``sys.argv``.

    Returns:
        argparse.Namespace with ``checkpoint_path`` (str) and the boolean
        switches ``cpu_only``, ``flash_attn2``, ``share`` and ``inbrowser``.
    """
    parser = ArgumentParser()
    parser.add_argument(
        '-c', '--checkpoint-path',
        type=str,
        default='tencent/HunyuanOCR',
        help='Checkpoint name or path, default to %(default)r',
    )

    # Every remaining option is a plain on/off switch; ``store_true``
    # already defaults to False, so no explicit default is needed.
    switches = (
        ('--cpu-only', 'Run demo with CPU only'),
        ('--flash-attn2', 'Enable flash_attention_2 when loading the model.'),
        ('--share', 'Create a publicly shareable link for the interface.'),
        ('--inbrowser',
         'Automatically launch the interface in a new tab on the default browser.'),
    )
    for flag, description in switches:
        parser.add_argument(flag, action='store_true', help=description)

    return parser.parse_args()
def _load_model_processor(args):
    """Load the HunyuanOCR model and its processor.

    Args:
        args: Parsed CLI namespace. ``checkpoint_path`` is required.
            ``cpu_only`` and ``flash_attn2`` are honoured when present and
            treated as False when absent.

    Returns:
        Tuple ``(model, processor)``; the model is returned in eval mode.
    """
    # The CLI exposes --cpu-only and --flash-attn2, so they must actually
    # influence loading. With both flags off the settings are the same as
    # before: eager attention and device_map="auto".
    cpu_only = getattr(args, 'cpu_only', False)
    use_flash_attn2 = getattr(args, 'flash_attn2', False) and not cpu_only
    attn_implementation = 'flash_attention_2' if use_flash_attn2 else 'eager'
    device_map = 'cpu' if cpu_only else 'auto'

    print("[INFO] Loading model (ZeroGPU uses eager mode)")
    print(f"[INFO] attn_implementation={attn_implementation}, device_map={device_map}")
    print(f"[INFO] CUDA available at load time: {torch.cuda.is_available()}")
    model = HunYuanVLForConditionalGeneration.from_pretrained(
        args.checkpoint_path,
        attn_implementation=attn_implementation,
        torch_dtype=torch.bfloat16,
        device_map=device_map,
    )

    # Disable gradient checkpointing for faster inference
    if hasattr(model, 'gradient_checkpointing_disable'):
        model.gradient_checkpointing_disable()
        print("[INFO] Gradient checkpointing disabled")

    # Set to evaluation mode
    model.eval()
    print("[INFO] Model set to eval mode")

    processor = AutoProcessor.from_pretrained(
        args.checkpoint_path, use_fast=False, trust_remote_code=True
    )
    print(f"[INFO] Model loaded, device: {next(model.parameters()).device}")
    return model, processor
def _parse_text(text):
    """Return ``text`` with every ``<trans>`` and ``</trans>`` tag removed.

    The opening tag is stripped first, then the closing tag; the result is
    not re-scanned for tags formed by the removal.
    """
    for tag in ("<trans>", "</trans>"):
        text = text.replace(tag, "")
    return text
def _remove_image_special(text):
    """Return ``text`` unchanged.

    Despite the name, nothing is stripped here: the regex-based removal of
    image tags is commented out below, so this is currently a pass-through.
    """
    # Disabled implementation, kept for reference:
    # if text is None:
    #     return text
    # # Remove image special tokens
    # import re
    # text = re.sub(r'<image>|</image>|<img>|</img>', '', text)
    # return text
    return text
def _gc():
    """Run the Python garbage collector, then empty the CUDA cache if CUDA is available."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
def clean_repeated_substrings(text):
    """Collapse a substring that is repeated back-to-back at the end of ``text``.

    Texts shorter than 2000 characters are returned untouched. Otherwise each
    period from 2 up to ``len(text) // 10`` is tried in increasing order: the
    trailing ``period`` characters form the pattern, and consecutive copies of
    it are counted walking backwards from the end. The first period with at
    least 10 copies wins, and all copies but one are cut off. If no period
    qualifies the text is returned unchanged.
    """
    total = len(text)
    if total < 2000:
        return text

    for period in range(2, total // 10 + 1):
        pattern = text[total - period:]
        repeats = 0
        start = total - period
        # Step back one period at a time while the pattern keeps matching.
        while start >= 0 and text.startswith(pattern, start):
            repeats += 1
            start -= period
        if repeats >= 10:
            keep = total - period * (repeats - 1)
            return text[:keep]

    return text
| def _launch_demo(args, model, processor): | |
| # Track first call | |
| first_call = [True] | |
| # Uses closure to access model and processor | |
| # Duration increased to 120s to avoid timeout during peak hours | |
def call_local_model(messages):
    """Run one greedy generation pass for a single conversation.

    Uses ``model``, ``processor``, ``first_call`` and ``args`` from the
    enclosing scope.

    Args:
        messages: The chat messages of one conversation, in the form accepted
            by ``processor.apply_chat_template`` and ``process_vision_info``.

    Returns:
        The list produced by ``processor.batch_decode`` (one entry per
        conversation, so one entry here); the first entry has trailing
        repeated substrings trimmed by ``clean_repeated_substrings``.

    Raises:
        Exception: anything raised by ``model.generate`` is logged with a
            traceback and re-raised.
    """
    # NOTE(review): this function is not wrapped in ``@spaces.GPU`` even though
    # ``spaces`` is imported by the file - confirm whether that is intentional
    # for the hardware this Space runs on.
    import sys
    import time
    import traceback

    start_time = time.time()
    if first_call[0]:
        print("[INFO] ========== First inference call ==========")
        first_call[0] = False
    else:
        print("[INFO] ========== Subsequent inference call ==========")

    print("[DEBUG] ========== Starting inference ==========")
    print(f"[DEBUG] Python version: {sys.version}")
    print(f"[DEBUG] PyTorch version: {torch.__version__}")
    print(f"[DEBUG] CUDA available: {torch.cuda.is_available()}")
    if torch.cuda.is_available():
        print(f"[DEBUG] CUDA device count: {torch.cuda.device_count()}")
        print(f"[DEBUG] Current CUDA device: {torch.cuda.current_device()}")
        print(f"[DEBUG] Device name: {torch.cuda.get_device_name(0)}")
        print(f"[DEBUG] GPU Memory allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
        print(f"[DEBUG] GPU Memory reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")

    # Ensure model is on GPU, unless the user explicitly asked for CPU.
    cpu_only = getattr(args, 'cpu_only', False)
    model_device = next(model.parameters()).device
    print(f"[DEBUG] Model device: {model_device}")
    print(f"[DEBUG] Model dtype: {next(model.parameters()).dtype}")
    if model_device.type == 'cpu':
        if cpu_only:
            print("[INFO] --cpu-only requested; keeping model on CPU")
        else:
            print("[ERROR] Model on CPU! Attempting to move to GPU...")
            if torch.cuda.is_available():
                move_start = time.time()
                model.cuda()
                move_time = time.time() - move_start
                print(f"[DEBUG] Model device after cuda(): {next(model.parameters()).device}")
                print(f"[DEBUG] Model moved to GPU in: {move_time:.2f}s")
            else:
                print("[CRITICAL] CUDA unavailable! Running on CPU will be slow!")
                print("[CRITICAL] This may be due to ZeroGPU resource constraints")
    else:
        print(f"[INFO] Model already on GPU: {model_device}")

    # The processor helpers work on a batch of conversations; wrap the single one.
    batch = [messages]

    # Build input using processor
    texts = [
        processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
        for msg in batch
    ]
    print(f"[DEBUG] Template built, elapsed: {time.time() - start_time:.2f}s")

    image_inputs, video_inputs = process_vision_info(batch)
    print(f"[DEBUG] Image processing done, elapsed: {time.time() - start_time:.2f}s")

    # Check image input size
    if image_inputs:
        for idx, img in enumerate(image_inputs):
            if hasattr(img, 'size'):
                print(f"[DEBUG] Image {idx} size: {img.size}")
            elif isinstance(img, np.ndarray):
                print(f"[DEBUG] Image {idx} shape: {img.shape}")

    print("[DEBUG] Starting processor encoding...")
    processor_start = time.time()
    inputs = processor(
        text=texts,
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    print(f"[DEBUG] Processor encoding done, elapsed: {time.time() - processor_start:.2f}s")

    # Put the inputs wherever the model actually lives, so the two can never
    # disagree (for example a CPU-resident model on a CUDA-capable host).
    to_device_start = time.time()
    inputs = inputs.to(next(model.parameters()).device)
    print(f"[DEBUG] Inputs moved to device, elapsed: {time.time() - to_device_start:.2f}s")
    print(f"[DEBUG] Input preparation done, total elapsed: {time.time() - start_time:.2f}s")
    print(f"[DEBUG] Input IDs shape: {inputs.input_ids.shape}")
    print(f"[DEBUG] Input device: {inputs.input_ids.device}")
    print(f"[DEBUG] Input sequence length: {inputs.input_ids.shape[1]}")

    # Generation
    gen_start = time.time()
    print("[DEBUG] ========== Starting token generation ==========")
    # Optimized max_new_tokens for OCR tasks
    max_new_tokens = 2048
    print(f"[DEBUG] max_new_tokens: {max_new_tokens}")

    with torch.no_grad():
        print(f"[DEBUG] Entered torch.no_grad() context, elapsed: {time.time() - start_time:.2f}s")
        # No separate "test" forward pass: it repeated the whole prefill and
        # its output was thrown away. generate() fails just as loudly.
        print(f"[DEBUG] Starting model.generate()... (elapsed: {time.time() - start_time:.2f}s)")
        generate_call_start = time.time()
        try:
            # Deterministic (greedy) generation. ``temperature`` is a
            # sampling-only knob, so it is not passed with do_sample=False.
            generated_ids = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=False,
            )
            print(f"[DEBUG] model.generate() returned, elapsed: {time.time() - generate_call_start:.2f}s")
        except Exception as e:
            print(f"[ERROR] Generation failed: {e}")
            traceback.print_exc()
            raise
    print("[DEBUG] Exited torch.no_grad() context")

    gen_time = time.time() - gen_start
    print("[DEBUG] ========== Generation complete ==========")
    print(f"[DEBUG] Generation time: {gen_time:.2f}s")
    print(f"[DEBUG] Output shape: {generated_ids.shape}")

    # Decode output: strip the prompt tokens from the front of each sequence.
    if "input_ids" in inputs:
        input_ids = inputs.input_ids
    else:
        input_ids = inputs.inputs
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(input_ids, generated_ids)
    ]
    actual_tokens = len(generated_ids_trimmed[0])
    print(f"[DEBUG] Actual tokens generated: {actual_tokens}")
    print(f"[DEBUG] Time per token: {gen_time/actual_tokens if actual_tokens > 0 else 0:.3f}s")

    output_texts = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
    total_time = time.time() - start_time
    print("[DEBUG] ========== All done ==========")
    print(f"[DEBUG] Total time: {total_time:.2f}s")
    print(f"[DEBUG] Output length: {len(output_texts[0])} chars")
    print(f"[DEBUG] Output preview: {output_texts[0][:100]}...")

    output_texts[0] = clean_repeated_substrings(output_texts[0])
    return output_texts
def create_predict_fn():
    """Build the Gradio handler that answers the most recent user turn.

    Returns:
        ``predict(_chatbot, task_history)``: a generator that yields the
        updated chatbot list once. Both arguments are lists of
        ``(query, answer)`` pairs whose last entry is the pending turn; a
        query that is a tuple/list is treated as ``(image_path_or_url,)``.
    """
    def predict(_chatbot, task_history):
        chat_query = _chatbot[-1][0]
        query = task_history[-1][0]
        if len(chat_query) == 0:
            # Empty submission: drop the placeholder turn from both lists.
            # This function is a generator, so the trimmed chatbot has to be
            # yielded - a ``return value`` would never reach the caller.
            _chatbot.pop()
            task_history.pop()
            yield _chatbot
            return

        print('User: ', query)
        history_cp = copy.deepcopy(task_history)
        messages = []
        content = []
        for q, a in history_cp:
            if isinstance(q, (tuple, list)):
                # Image turn: keep URLs as-is, make local paths absolute.
                # Images accumulate until the next text turn closes the message.
                img_path = q[0]
                if img_path.startswith(('http://', 'https://')):
                    content.append({'type': 'image', 'image': img_path})
                else:
                    content.append({'type': 'image', 'image': f'{os.path.abspath(img_path)}'})
            else:
                content.append({'type': 'text', 'text': q})
                messages.append({'role': 'user', 'content': content})
                messages.append({'role': 'assistant', 'content': [{'type': 'text', 'text': a}]})
                content = []
        # The final assistant entry belongs to the turn being answered now.
        messages.pop()

        # Call model to get response
        response_list = call_local_model(messages)
        response = response_list[0] if response_list else ""

        _chatbot[-1] = (_parse_text(chat_query), _remove_image_special(_parse_text(response)))
        full_response = _parse_text(response)
        task_history[-1] = (query, full_response)
        print('HunyuanOCR: ' + _parse_text(full_response))
        yield _chatbot

    return predict
def create_regenerate_fn():
    """Build the Gradio handler that re-runs the last answered turn.

    Returns:
        ``regenerate(_chatbot, task_history)``: a generator that yields
        chatbot lists. When there is nothing to regenerate it yields the
        chatbot unchanged; otherwise it clears the last answer and delegates
        to ``predict`` from the enclosing scope.
    """
    def regenerate(_chatbot, task_history):
        # This is a generator, so the "nothing to do" paths must yield the
        # chatbot; a ``return value`` would produce no output at all.
        if not task_history:
            yield _chatbot
            return
        item = task_history[-1]
        if item[1] is None:
            yield _chatbot
            return

        # Forget the previous answer in both histories.
        task_history[-1] = (item[0], None)
        chatbot_item = _chatbot.pop(-1)
        if chatbot_item[0] is None:
            _chatbot[-1] = (_chatbot[-1][0], None)
        else:
            _chatbot.append((chatbot_item[0], None))

        # Use outer predict function
        yield from predict(_chatbot, task_history)

    return regenerate
# Instantiate the two chat handlers. `regenerate` looks up `predict` by name
# when it runs, so both are bound in this same scope.
predict = create_predict_fn()
regenerate = create_regenerate_fn()
def add_text(history, task_history, text):
    """Append a pending text turn to both histories.

    Args:
        history: Chatbot display list, or None for an empty one.
        task_history: Raw task history list, or None for an empty one.
        text: The user's query.

    Returns:
        ``(history, task_history, '')`` where both lists are new objects with
        one ``(query, None)`` entry appended; the display copy stores the
        query after ``_parse_text``.
    """
    chat_rows = [] if history is None else history
    task_rows = [] if task_history is None else task_history
    return (
        chat_rows + [(_parse_text(text), None)],
        task_rows + [(text, None)],
        '',
    )
def add_file(history, task_history, file):
    """Append an uploaded file as a pending image turn to both histories.

    Args:
        history: Chatbot display list, or None for an empty one.
        task_history: Raw task history list, or None for an empty one.
        file: Uploaded file object; only its ``name`` attribute is read.

    Returns:
        ``(history, task_history)`` as new lists, each with one
        ``((file.name,), None)`` entry appended.
    """
    chat_rows = [] if history is None else history
    task_rows = [] if task_history is None else task_history
    return (
        chat_rows + [((file.name,), None)],
        task_rows + [((file.name,), None)],
    )
def download_url_image(url):
    """Download the image at ``url`` to a local temp file.

    The file name is derived from the MD5 of the URL, so a second call for
    the same URL returns the existing file without downloading again.

    Args:
        url: Image URL to fetch.

    Returns:
        Path of the local file on success; the original ``url`` if anything
        fails (the error is printed, not raised).
    """
    try:
        # Use URL hash as filename to avoid duplicate downloads
        url_hash = hashlib.md5(url.encode()).hexdigest()
        temp_dir = tempfile.gettempdir()
        temp_path = os.path.join(temp_dir, f"hyocr_demo_{url_hash}.png")

        # Return cached file if exists
        if os.path.exists(temp_path):
            return temp_path

        # Download image
        response = requests.get(url, timeout=10)
        response.raise_for_status()

        # Write to a scratch file and rename it into place. Writing straight
        # to temp_path could leave a truncated file behind on failure, which
        # every later call would then return as a valid cache hit.
        fd, partial_path = tempfile.mkstemp(
            prefix="hyocr_demo_", suffix=".part", dir=temp_dir
        )
        try:
            with os.fdopen(fd, 'wb') as f:
                f.write(response.content)
            os.replace(partial_path, temp_path)
        finally:
            # Only still present if the write or rename failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)
        return temp_path
    except Exception as e:
        # Deliberate best-effort: callers fall back to the remote URL.
        print(f"Failed to download image: {url}, error: {e}")
        return url  # Return original URL on failure
def reset_user_input():
    """Return a Gradio update that sets the bound output's value to an empty string."""
    return gr.update(value='')
def reset_state(_chatbot, task_history):
    """Empty both histories in place, run ``_gc()``, and return a new empty list.

    Args:
        _chatbot: Chatbot display list; cleared in place.
        task_history: Raw task history list; cleared in place.

    Returns:
        A fresh empty list.
    """
    for store in (task_history, _chatbot):
        store.clear()
    _gc()
    return []
# Example image paths - local files, keyed by task name. Every example
# lives at examples/<task>.jpg.
EXAMPLE_IMAGES = {
    task: f"examples/{task}.jpg"
    for task in ("spotting", "parsing", "ie", "vqa", "translation")
}
# Build the Gradio UI, wire its events, then start the server.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from a listing that had lost its indentation - confirm against the intended
# layout.
with gr.Blocks() as demo:
    # Header
    gr.Markdown("# HunyuanOCR\n*Powered by Tencent Hunyuan Team*")

    with gr.Column():
        # Chat area
        chatbot = gr.Chatbot(
            label='Chat',
            height=600,
            bubble_full_width=False,
            layout="bubble",
            show_copy_button=True,
        )

        # Input panel
        with gr.Group():
            query = gr.Textbox(
                lines=2,
                label='Enter your question',
                placeholder='Upload an image first, then enter your question. Example: Detect and recognize text in this image.',
                show_label=False
            )
            with gr.Row():
                addfile_btn = gr.UploadButton('Upload Image', file_types=['image'])
                submit_btn = gr.Button('Send', variant="primary", scale=3)
                regen_btn = gr.Button('Regenerate')
                empty_bin = gr.Button('Clear')

        # Examples section
        gr.Markdown("### Quick Examples - Click to load")
        with gr.Row():
            example_1_btn = gr.Button("Text Detection")
            example_2_btn = gr.Button("Document Parsing")
            example_3_btn = gr.Button("Info Extraction")
            example_4_btn = gr.Button("Visual Q&A")
            example_5_btn = gr.Button("Translation")

    task_history = gr.State([])

    def _make_example_loader(index, image_key, prompt):
        """Return a click handler that starts a new chat with one example image.

        The handler ignores the current histories, replaces both with a single
        pending image turn, and puts ``prompt`` into the query box.
        """
        def load_example(history, task_hist):
            image_path = EXAMPLE_IMAGES[image_key]
            return [((image_path,), None)], [((image_path,), None)], prompt

        # Keep the per-example handler names the hand-written versions had.
        load_example.__name__ = f"load_example_{index}"
        return load_example

    # (button, EXAMPLE_IMAGES key, prompt) for each quick example.
    example_specs = [
        (example_1_btn, "spotting",
         "Detect and recognize all text in this image. Output the text with bounding box coordinates."),
        (example_2_btn, "parsing",
         "Extract all text from this document in markdown format. Use HTML for tables and LaTeX for equations. Parse in reading order."),
        (example_3_btn, "ie",
         "Extract the following fields from this receipt and return as JSON: ['total', 'subtotal', 'tax', 'date', 'items']"),
        (example_4_btn, "vqa",
         "Look at this chart and answer: Which quarter had the highest revenue? What was the Sales value in Q4?"),
        (example_5_btn, "translation",
         "Translate all text in this image to English."),
    ]

    # Bind events
    for index, (button, image_key, prompt) in enumerate(example_specs, start=1):
        button.click(
            _make_example_loader(index, image_key, prompt),
            [chatbot, task_history],
            [chatbot, task_history, query],
        )

    # add_text returns three values (chatbot, task history, cleared query
    # text), so all three outputs are bound here. That also clears the
    # textbox, which makes a separate reset handler on this button redundant.
    submit_btn.click(
        add_text, [chatbot, task_history, query], [chatbot, task_history, query]
    ).then(predict, [chatbot, task_history], [chatbot], show_progress=True)
    empty_bin.click(reset_state, [chatbot, task_history], [chatbot], show_progress=True)
    regen_btn.click(regenerate, [chatbot, task_history], [chatbot], show_progress=True)
    addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn], [chatbot, task_history], show_progress=True)

    # Feature descriptions
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("""
            ### Core Features
            - **Text Detection & Recognition** - Multi-scene text detection and recognition
            - **Document Parsing** - Automatic document structure recognition
            - **Information Extraction** - Extract structured data from receipts and forms
            - **Visual Q&A** - Text-centric open-ended question answering
            - **Translation** - Translate text in images across 14+ languages
            """)
        with gr.Column(scale=1):
            gr.Markdown("""
            ### Usage Tips
            - **Inference** - For production, use VLLM for better performance
            - **Image Quality** - Ensure images are clear, well-lit, and not heavily skewed
            - **File Size** - Recommended max 10MB per image, JPG/PNG format
            - **Use Cases** - OCR, document digitization, receipt recognition, translation
            """)

    # Footer
    gr.Markdown("---\n*2025 Tencent Hunyuan Team. For research and educational use.*")

demo.queue().launch(
    share=args.share,
    inbrowser=args.inbrowser,
)
def main():
    """Parse the CLI arguments, load the model and processor, and launch the demo."""
    cli_args = _get_args()
    _launch_demo(cli_args, *_load_model_processor(cli_args))


if __name__ == '__main__':
    main()