import os
import torch
import numpy as np
from PIL import Image
import spaces
from transformers import AutoProcessor
from qwen_vl_utils import process_vision_info
from transformers import HunYuanVLForConditionalGeneration
import gradio as gr
from argparse import ArgumentParser
import copy
import requests
from io import BytesIO
import tempfile
import hashlib
import gc
# Process-wide environment tweaks applied at import time: tokenizers
# parallelism is switched off and Transformers advisory warnings are silenced.
os.environ.update({
    'TOKENIZERS_PARALLELISM': 'false',
    'TRANSFORMERS_NO_ADVISORY_WARNINGS': '1',
})
# NOTE: the TorchScript profiling toggles below are intentionally left disabled.
# torch._C._jit_set_profiling_executor(False)
# torch._C._jit_set_profiling_mode(False)
def _get_args():
    """Parse the demo's command-line options from ``sys.argv``.

    Returns:
        argparse.Namespace with ``checkpoint_path`` (str) and the boolean
        switches ``cpu_only``, ``flash_attn2``, ``share`` and ``inbrowser``.
    """
    cli = ArgumentParser()
    cli.add_argument(
        '-c',
        '--checkpoint-path',
        type=str,
        default='tencent/HunyuanOCR',
        help='Checkpoint name or path, default to %(default)r',
    )
    cli.add_argument('--cpu-only', action='store_true', help='Run demo with CPU only')

    # The remaining options are plain on/off switches that default to off.
    switches = (
        ('--flash-attn2', 'Enable flash_attention_2 when loading the model.'),
        ('--share', 'Create a publicly shareable link for the interface.'),
        ('--inbrowser',
         'Automatically launch the interface in a new tab on the default browser.'),
    )
    for flag, description in switches:
        cli.add_argument(flag, action='store_true', default=False, help=description)

    return cli.parse_args()
def _load_model_processor(args):
    """Load the HunyuanOCR model and its processor from ``args.checkpoint_path``.

    The ``--cpu-only`` and ``--flash-attn2`` switches are honoured here:

    * ``cpu_only`` pins every weight to the CPU instead of letting
      ``device_map="auto"`` pick a device.
    * ``flash_attn2`` selects the ``flash_attention_2`` attention backend.
      It is ignored together with ``cpu_only``; otherwise the default stays
      ``eager``, which the ZeroGPU setup relies on because the model starts
      on the CPU.

    Args:
        args: Namespace providing ``checkpoint_path`` and, optionally,
            ``cpu_only`` / ``flash_attn2`` (both treated as False if absent).

    Returns:
        ``(model, processor)`` with the model already in eval mode.
    """
    cpu_only = getattr(args, 'cpu_only', False)
    use_flash_attn = getattr(args, 'flash_attn2', False) and not cpu_only
    attn_implementation = 'flash_attention_2' if use_flash_attn else 'eager'
    device_map = 'cpu' if cpu_only else 'auto'

    print(f"[INFO] Loading model (attn_implementation={attn_implementation!r}, "
          f"device_map={device_map!r})")
    print(f"[INFO] CUDA available at load time: {torch.cuda.is_available()}")
    model = HunYuanVLForConditionalGeneration.from_pretrained(
        args.checkpoint_path,
        attn_implementation=attn_implementation,
        torch_dtype=torch.bfloat16,
        device_map=device_map,
    )

    # Gradient checkpointing only matters for training; turn it off when the
    # model exposes the switch.
    if hasattr(model, 'gradient_checkpointing_disable'):
        model.gradient_checkpointing_disable()
        print("[INFO] Gradient checkpointing disabled")

    model.eval()
    print("[INFO] Model set to eval mode")

    processor = AutoProcessor.from_pretrained(
        args.checkpoint_path, use_fast=False, trust_remote_code=True
    )
    print(f"[INFO] Model loaded, device: {next(model.parameters()).device}")
    return model, processor
def _parse_text(text):
    """Run the display-formatting substitutions over ``text`` and return it.

    ``text`` must be a string; ``None`` is not accepted.
    """
    # Both passes replace the empty string with the empty string, so as
    # written they hand back text equal to the input.
    first_pass = text.replace("", "")
    second_pass = first_pass.replace("", "")
    return second_pass
def _remove_image_special(text):
    """Return ``text`` unchanged.

    This is the hook where image special tokens would be stripped from a
    model response before display. No stripping is currently performed, so
    any value (including ``None``) is passed straight through.
    """
    return text
def _gc():
    """Free unreferenced Python objects and, when CUDA is usable, the CUDA cache."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
def clean_repeated_substrings(text, *, min_length=2000, min_repeats=10):
    """Collapse a degenerate repeated tail of ``text`` to a single copy.

    Looks for the shortest suffix of at least two characters that is repeated
    back-to-back ``min_repeats`` or more times at the very end of ``text``
    and, if found, keeps only one occurrence of it.

    Args:
        text: String to clean.
        min_length: Texts shorter than this are returned untouched.
        min_repeats: Minimum number of consecutive copies of the suffix that
            counts as a runaway repetition. Must be at least 2.

    Returns:
        ``text`` with the repeated tail collapsed, or ``text`` itself when no
        qualifying repetition exists.

    Raises:
        ValueError: If ``min_repeats`` is smaller than 2.
    """
    if min_repeats < 2:
        raise ValueError("min_repeats must be at least 2")

    n = len(text)
    if n < min_length:
        return text

    # A unit longer than n // min_repeats cannot fit min_repeats times.
    for length in range(2, n // min_repeats + 1):
        candidate = text[-length:]
        count = 0
        i = n - length
        # Walk backwards one unit at a time while the unit keeps matching;
        # startswith(..., i) compares in place without slicing a new string.
        while i >= 0 and text.startswith(candidate, i):
            count += 1
            i -= length
        if count >= min_repeats:
            # Drop all but one of the repeated copies.
            return text[:n - length * (count - 1)]
    return text
def _history_to_messages(task_history):
    """Translate the demo's task history into chat-template messages.

    ``task_history`` is a list of ``(query, answer)`` pairs. A tuple/list
    query is an uploaded image whose first element is a path or URL; a string
    query is user text. Images are collected and attached to the next text
    turn. The assistant entry of the last text turn (the one still waiting
    for an answer) is dropped so the conversation ends on a user message.

    Raises:
        IndexError: If the history contains no text turn at all.
    """
    messages = []
    pending_content = []
    for query, answer in task_history:
        if isinstance(query, (tuple, list)):
            source = query[0]
            # URLs are passed through; local paths are made absolute.
            if not source.startswith(('http://', 'https://')):
                source = os.path.abspath(source)
            pending_content.append({'type': 'image', 'image': source})
            continue
        pending_content.append({'type': 'text', 'text': query})
        messages.append({'role': 'user', 'content': pending_content})
        messages.append({'role': 'assistant', 'content': [{'type': 'text', 'text': answer}]})
        pending_content = []
    # Remove the placeholder assistant reply of the turn being answered.
    messages.pop()
    return messages


def _launch_demo(args, model, processor):
    """Build the Gradio chat UI for HunyuanOCR and start serving it.

    Args:
        args: Parsed CLI namespace; ``cpu_only``, ``share`` and ``inbrowser``
            are read here.
        model: Loaded generation model, captured by the inference closure.
        processor: Matching processor used for chat templating, input
            encoding and output decoding.
    """
    # Track first call
    first_call = True

    # Uses closure to access model and processor.
    # Duration is 120s to avoid timeouts during peak hours.
    @spaces.GPU(duration=120)
    def call_local_model(messages):
        """Run one greedy generation for a single conversation.

        Args:
            messages: Chat-template message list for one conversation.

        Returns:
            A one-element list holding the decoded response, with any
            runaway repeated tail collapsed.
        """
        nonlocal first_call
        import sys
        import time
        import traceback

        start_time = time.time()
        if first_call:
            print("[INFO] ========== First inference call ==========")
            first_call = False
        else:
            print("[INFO] ========== Subsequent inference call ==========")

        cuda_available = torch.cuda.is_available()
        print("[DEBUG] ========== Starting inference ==========")
        print(f"[DEBUG] Python version: {sys.version}")
        print(f"[DEBUG] PyTorch version: {torch.__version__}")
        print(f"[DEBUG] CUDA available: {cuda_available}")
        if cuda_available:
            print(f"[DEBUG] CUDA device count: {torch.cuda.device_count()}")
            print(f"[DEBUG] Current CUDA device: {torch.cuda.current_device()}")
            print(f"[DEBUG] Device name: {torch.cuda.get_device_name(0)}")
            print(f"[DEBUG] GPU Memory allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
            print(f"[DEBUG] GPU Memory reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")

        # Make sure the model sits on the GPU unless the user asked for CPU.
        model_device = next(model.parameters()).device
        print(f"[DEBUG] Model device: {model_device}")
        print(f"[DEBUG] Model dtype: {next(model.parameters()).dtype}")
        if model_device.type != 'cpu':
            print(f"[INFO] Model already on GPU: {model_device}")
        elif args.cpu_only:
            print("[INFO] --cpu-only set, keeping model on CPU")
        elif cuda_available:
            print("[ERROR] Model on CPU! Attempting to move to GPU...")
            move_start = time.time()
            model.cuda()
            move_time = time.time() - move_start
            print(f"[DEBUG] Model device after cuda(): {next(model.parameters()).device}")
            print(f"[DEBUG] Model moved to GPU in: {move_time:.2f}s")
        else:
            print("[ERROR] Model on CPU! Attempting to move to GPU...")
            print("[CRITICAL] CUDA unavailable! Running on CPU will be slow!")
            print("[CRITICAL] This may be due to ZeroGPU resource constraints")

        # The processor helpers work on a batch; this demo always sends one.
        conversations = [messages]
        texts = [
            processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
            for msg in conversations
        ]
        print(f"[DEBUG] Template built, elapsed: {time.time() - start_time:.2f}s")
        image_inputs, video_inputs = process_vision_info(conversations)
        print(f"[DEBUG] Image processing done, elapsed: {time.time() - start_time:.2f}s")

        # Log input image sizes.
        if image_inputs:
            for idx, img in enumerate(image_inputs):
                if hasattr(img, 'size'):
                    print(f"[DEBUG] Image {idx} size: {img.size}")
                elif isinstance(img, np.ndarray):
                    print(f"[DEBUG] Image {idx} shape: {img.shape}")

        print("[DEBUG] Starting processor encoding...")
        processor_start = time.time()
        inputs = processor(
            text=texts,
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        print(f"[DEBUG] Processor encoding done, elapsed: {time.time() - processor_start:.2f}s")

        # Inputs must live on the same device as the model, wherever the
        # model ended up after the placement step above.
        to_device_start = time.time()
        inputs = inputs.to(next(model.parameters()).device)
        print(f"[DEBUG] Inputs moved to device, elapsed: {time.time() - to_device_start:.2f}s")
        print(f"[DEBUG] Input preparation done, total elapsed: {time.time() - start_time:.2f}s")
        print(f"[DEBUG] Input IDs shape: {inputs.input_ids.shape}")
        print(f"[DEBUG] Input device: {inputs.input_ids.device}")
        print(f"[DEBUG] Input sequence length: {inputs.input_ids.shape[1]}")

        # Generation
        gen_start = time.time()
        print("[DEBUG] ========== Starting token generation ==========")
        max_new_tokens = 2048
        print(f"[DEBUG] max_new_tokens: {max_new_tokens}")

        with torch.no_grad():
            print(f"[DEBUG] Starting model.generate()... (elapsed: {time.time() - start_time:.2f}s)")
            generate_call_start = time.time()
            try:
                # Greedy decoding: do_sample=False alone makes it
                # deterministic, so no temperature is passed.
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=False,
                )
                print(f"[DEBUG] model.generate() returned, elapsed: {time.time() - generate_call_start:.2f}s")
            except Exception as e:
                print(f"[ERROR] Generation failed: {e}")
                traceback.print_exc()
                raise

        gen_time = time.time() - gen_start
        print("[DEBUG] ========== Generation complete ==========")
        print(f"[DEBUG] Generation time: {gen_time:.2f}s")
        print(f"[DEBUG] Output shape: {generated_ids.shape}")

        # Strip the prompt tokens so only newly generated tokens are decoded.
        prompt_ids = inputs.input_ids if "input_ids" in inputs else inputs.inputs
        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(prompt_ids, generated_ids)
        ]
        actual_tokens = len(generated_ids_trimmed[0])
        print(f"[DEBUG] Actual tokens generated: {actual_tokens}")
        print(f"[DEBUG] Time per token: {gen_time/actual_tokens if actual_tokens > 0 else 0:.3f}s")

        output_texts = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        total_time = time.time() - start_time
        print("[DEBUG] ========== All done ==========")
        print(f"[DEBUG] Total time: {total_time:.2f}s")
        print(f"[DEBUG] Output length: {len(output_texts[0])} chars")
        print(f"[DEBUG] Output preview: {output_texts[0][:100]}...")
        output_texts[0] = clean_repeated_substrings(output_texts[0])
        return output_texts

    def predict(_chatbot, task_history):
        """Answer the latest user turn and yield the updated chatbot list.

        This is a generator, so every exit path yields; a bare ``return``
        with a value would hand nothing to the UI.
        """
        chat_query = _chatbot[-1][0]
        query = task_history[-1][0]
        if len(chat_query) == 0:
            # Empty submission: discard the blank turn and refresh the UI.
            _chatbot.pop()
            task_history.pop()
            yield _chatbot
            return

        print('User: ', query)
        messages = _history_to_messages(task_history)

        # Call model to get response
        response_list = call_local_model(messages)
        response = response_list[0] if response_list else ""
        full_response = _parse_text(response)
        _chatbot[-1] = (_parse_text(chat_query), _remove_image_special(full_response))
        task_history[-1] = (query, full_response)
        print('HunyuanOCR: ' + full_response)
        yield _chatbot

    def regenerate(_chatbot, task_history):
        """Discard the last answer and generate it again."""
        if not task_history or task_history[-1][1] is None:
            # Nothing has been answered yet; leave the chat as it is.
            yield _chatbot
            return

        task_history[-1] = (task_history[-1][0], None)
        chatbot_item = _chatbot.pop(-1)
        if chatbot_item[0] is None:
            _chatbot[-1] = (_chatbot[-1][0], None)
        else:
            _chatbot.append((chatbot_item[0], None))
        yield from predict(_chatbot, task_history)

    def add_text(history, task_history, text):
        """Append a user text turn; the third return value clears the textbox."""
        history = history if history is not None else []
        task_history = task_history if task_history is not None else []
        history = history + [(_parse_text(text), None)]
        task_history = task_history + [(text, None)]
        return history, task_history, ''

    def add_file(history, task_history, file):
        """Append an uploaded image as a turn of its own."""
        history = history if history is not None else []
        task_history = task_history if task_history is not None else []
        history = history + [((file.name,), None)]
        task_history = task_history + [((file.name,), None)]
        return history, task_history

    def download_url_image(url):
        """Download URL image to a local temp file and return its path.

        The file name is derived from the URL's MD5 so repeated requests for
        the same URL reuse the cached copy. Best effort: on any failure the
        original URL is returned instead.

        NOTE: not currently bound to any event handler in this function.
        """
        try:
            url_hash = hashlib.md5(url.encode()).hexdigest()
            temp_path = os.path.join(tempfile.gettempdir(), f"hyocr_demo_{url_hash}.png")
            # Return cached file if exists
            if os.path.exists(temp_path):
                return temp_path
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            with open(temp_path, 'wb') as f:
                f.write(response.content)
            return temp_path
        except Exception as e:
            print(f"Failed to download image: {url}, error: {e}")
            return url  # Return original URL on failure

    def reset_state(_chatbot, task_history):
        """Clear both histories, release memory, and blank the chat window."""
        task_history.clear()
        _chatbot.clear()
        _gc()
        return []

    # Example image paths - local files
    EXAMPLE_IMAGES = {
        "spotting": "examples/spotting.jpg",
        "parsing": "examples/parsing.jpg",
        "ie": "examples/ie.jpg",
        "vqa": "examples/vqa.jpg",
        "translation": "examples/translation.jpg",
    }

    # (button label, EXAMPLE_IMAGES key, prompt placed in the textbox)
    EXAMPLES = (
        ("Text Detection", "spotting",
         "Detect and recognize all text in this image. Output the text with bounding box coordinates."),
        ("Document Parsing", "parsing",
         "Extract all text from this document in markdown format. Use HTML for tables and LaTeX for equations. Parse in reading order."),
        ("Info Extraction", "ie",
         "Extract the following fields from this receipt and return as JSON: ['total', 'subtotal', 'tax', 'date', 'items']"),
        ("Visual Q&A", "vqa",
         "Look at this chart and answer: Which quarter had the highest revenue? What was the Sales value in Q4?"),
        ("Translation", "translation",
         "Translate all text in this image to English."),
    )

    def make_example_loader(index, image_path, prompt):
        """Create the click handler that loads one example image and prompt."""
        def load_example(history, task_hist):
            # Loading an example starts a fresh conversation; the incoming
            # histories are deliberately ignored.
            turn = ((image_path,), None)
            return [turn], [turn], prompt
        # Keep a distinct handler name per example button.
        load_example.__name__ = f"load_example_{index}"
        return load_example

    core_features_md = (
        "\n### Core Features\n"
        "- **Text Detection & Recognition** - Multi-scene text detection and recognition\n"
        "- **Document Parsing** - Automatic document structure recognition\n"
        "- **Information Extraction** - Extract structured data from receipts and forms\n"
        "- **Visual Q&A** - Text-centric open-ended question answering\n"
        "- **Translation** - Translate text in images across 14+ languages\n"
    )
    usage_tips_md = (
        "\n### Usage Tips\n"
        "- **Inference** - For production, use VLLM for better performance\n"
        "- **Image Quality** - Ensure images are clear, well-lit, and not heavily skewed\n"
        "- **File Size** - Recommended max 10MB per image, JPG/PNG format\n"
        "- **Use Cases** - OCR, document digitization, receipt recognition, translation\n"
    )

    with gr.Blocks() as demo:
        # Header
        gr.Markdown("# HunyuanOCR\n*Powered by Tencent Hunyuan Team*")

        with gr.Column():
            # Chat area
            chatbot = gr.Chatbot(
                label='Chat',
                height=600,
                bubble_full_width=False,
                layout="bubble",
                show_copy_button=True,
            )

            # Input panel
            with gr.Group():
                query = gr.Textbox(
                    lines=2,
                    label='Enter your question',
                    placeholder='Upload an image first, then enter your question. Example: Detect and recognize text in this image.',
                    show_label=False,
                )
                with gr.Row():
                    addfile_btn = gr.UploadButton('Upload Image', file_types=['image'])
                    submit_btn = gr.Button('Send', variant="primary", scale=3)
                    regen_btn = gr.Button('Regenerate')
                    empty_bin = gr.Button('Clear')

            # Examples section
            gr.Markdown("### Quick Examples - Click to load")
            with gr.Row():
                example_buttons = [gr.Button(label) for label, _, _ in EXAMPLES]

            task_history = gr.State([])

            # Bind events
            for index, (button, (_, image_key, prompt)) in enumerate(
                    zip(example_buttons, EXAMPLES), start=1):
                button.click(
                    make_example_loader(index, EXAMPLE_IMAGES[image_key], prompt),
                    [chatbot, task_history],
                    [chatbot, task_history, query],
                )

            # add_text returns the cleared textbox value as its third output,
            # so the textbox is reset by the same event that records the turn.
            submit_btn.click(
                add_text, [chatbot, task_history, query], [chatbot, task_history, query]
            ).then(predict, [chatbot, task_history], [chatbot], show_progress=True)
            empty_bin.click(reset_state, [chatbot, task_history], [chatbot], show_progress=True)
            regen_btn.click(regenerate, [chatbot, task_history], [chatbot], show_progress=True)
            addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn],
                               [chatbot, task_history], show_progress=True)

        # Feature descriptions
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown(core_features_md)
            with gr.Column(scale=1):
                gr.Markdown(usage_tips_md)

        # Footer
        gr.Markdown("---\n*2025 Tencent Hunyuan Team. For research and educational use.*")

    demo.queue().launch(
        share=args.share,
        inbrowser=args.inbrowser,
    )
def main():
    """Script entry point: parse CLI options, load the model, launch the demo."""
    cli_args = _get_args()
    loaded_model, loaded_processor = _load_model_processor(cli_args)
    _launch_demo(cli_args, loaded_model, loaded_processor)


if __name__ == '__main__':
    main()