HunyuanOCR-ENGLISH

Running

App Files Files Community

HunyuanOCR-ENGLISH / app.py

victor HF Staff

Revert "Use Gradio Examples component with image thumbnails"

ef880bf 7 days ago

raw

history blame contribute delete

22.1 kB

	import os
	import torch
	import numpy as np
	from PIL import Image
	import spaces
	from transformers import AutoProcessor
	from qwen_vl_utils import process_vision_info
	from transformers import HunYuanVLForConditionalGeneration
	import gradio as gr
	from argparse import ArgumentParser
	import copy
	import requests
	from io import BytesIO
	import tempfile
	import hashlib
	import gc

	# Process-wide settings for the Hugging Face stack, applied in a single update.
	# NOTE(review): these are assigned after `transformers` has already been
	# imported above - confirm the libraries read them lazily rather than at import.
	os.environ.update({
	    'TOKENIZERS_PARALLELISM': 'false',
	    'TRANSFORMERS_NO_ADVISORY_WARNINGS': '1',
	})
	# Left disabled on purpose (JIT profiling toggles):
	# torch._C._jit_set_profiling_executor(False)
	# torch._C._jit_set_profiling_mode(False)





	def _get_args():
	parser = ArgumentParser()

	parser.add_argument('-c',
	'--checkpoint-path',
	type=str,
	default='tencent/HunyuanOCR',
	help='Checkpoint name or path, default to %(default)r')
	parser.add_argument('--cpu-only', action='store_true', help='Run demo with CPU only')

	parser.add_argument('--flash-attn2',
	action='store_true',
	default=False,
	help='Enable flash_attention_2 when loading the model.')
	parser.add_argument('--share',
	action='store_true',
	default=False,
	help='Create a publicly shareable link for the interface.')
	parser.add_argument('--inbrowser',
	action='store_true',
	default=False,
	help='Automatically launch the interface in a new tab on the default browser.')


	args = parser.parse_args()
	return args


	def _load_model_processor(args):
	    """Load the vision-language model and its processor from a checkpoint.

	    Args:
	        args: Parsed CLI namespace. ``checkpoint_path`` is required;
	            ``cpu_only`` and ``flash_attn2`` are honoured when present and
	            treated as False when missing.

	    Returns:
	        A ``(model, processor)`` tuple; the model is in eval mode.
	    """
	    # Previously these two flags were accepted on the command line but never
	    # consulted. The defaults below reproduce the old hard-coded behaviour.
	    cpu_only = bool(getattr(args, 'cpu_only', False))
	    use_flash_attn = bool(getattr(args, 'flash_attn2', False)) and not cpu_only

	    # ZeroGPU: model loads on CPU and uses eager attention by default; it is
	    # moved to the GPU inside the @spaces.GPU-decorated inference function.
	    attn_implementation = "flash_attention_2" if use_flash_attn else "eager"
	    device_map = "cpu" if cpu_only else "auto"

	    print("[INFO] Loading model (ZeroGPU uses eager mode)")
	    print(f"[INFO] attn_implementation={attn_implementation}, device_map={device_map}")
	    print(f"[INFO] CUDA available at load time: {torch.cuda.is_available()}")

	    model = HunYuanVLForConditionalGeneration.from_pretrained(
	        args.checkpoint_path,
	        attn_implementation=attn_implementation,
	        torch_dtype=torch.bfloat16,
	        device_map=device_map,
	    )

	    # Disable gradient checkpointing for faster inference
	    if hasattr(model, 'gradient_checkpointing_disable'):
	        model.gradient_checkpointing_disable()
	        print("[INFO] Gradient checkpointing disabled")

	    # Set to evaluation mode
	    model.eval()
	    print("[INFO] Model set to eval mode")

	    processor = AutoProcessor.from_pretrained(
	        args.checkpoint_path, use_fast=False, trust_remote_code=True
	    )

	    print(f"[INFO] Model loaded, device: {next(model.parameters()).device}")
	    return model, processor


	def _parse_text(text):
	"""Parse text, handle special formatting"""
	# if text is None:
	# return text
	text = text.replace("<trans>", "").replace("</trans>", "")
	return text


	def _remove_image_special(text):
	"""Remove image special tokens"""
	# if text is None:
	# return text
	# # Remove image special tokens
	# import re
	# text = re.sub(r'<image>\|</image>\|<img>\|</img>', '', text)
	# return text
	return text


	def _gc():
	"""Garbage collection"""
	gc.collect()
	if torch.cuda.is_available():
	torch.cuda.empty_cache()

	def clean_repeated_substrings(text, *, min_length=2000, min_repeats=10):
	    """Collapse a substring that is repeated back-to-back at the end of ``text``.

	    Only a trailing run is considered. Candidate unit lengths are tried from
	    2 characters upward; the first unit found at least ``min_repeats`` times
	    in a row at the end of the text is reduced to a single copy.

	    Args:
	        text: The string to clean.
	        min_length: Texts shorter than this are returned unchanged
	            (default 2000, the previously hard-coded threshold).
	        min_repeats: Minimum number of consecutive copies that counts as a
	            degenerate loop (default 10, the previously hard-coded value).

	    Returns:
	        ``text`` with the trailing repetition collapsed to one copy, or
	        ``text`` itself when no such repetition is found.

	    Raises:
	        ValueError: If ``min_repeats`` is smaller than 1.
	    """
	    if min_repeats < 1:
	        raise ValueError("min_repeats must be at least 1")

	    n = len(text)
	    if n < min_length:
	        return text

	    # A unit repeated min_repeats times cannot be longer than n // min_repeats.
	    for length in range(2, n // min_repeats + 1):
	        candidate = text[-length:]
	        count = 0
	        i = n - length

	        # Walk backwards one unit at a time while the unit keeps matching;
	        # startswith(candidate, i) compares in place without slicing.
	        while i >= 0 and text.startswith(candidate, i):
	            count += 1
	            i -= length

	        if count >= min_repeats:
	            # Drop all but one copy of the repeated unit.
	            return text[:n - length * (count - 1)]

	    return text


	def _launch_demo(args, model, processor):
	    """Build the Gradio chat UI around ``model``/``processor`` and launch it.

	    Args:
	        args: Parsed CLI namespace; ``share`` and ``inbrowser`` are forwarded
	            to ``launch()``.
	        model: The loaded generation model (used via closure for inference).
	        processor: The matching processor (chat template, encoding, decoding).
	    """
	    # Track first call (a one-element list so the closure can flip it).
	    first_call = [True]

	    # Uses closure to access model and processor
	    # Duration increased to 120s to avoid timeout during peak hours
	    @spaces.GPU(duration=120)
	    def call_local_model(messages):
	        """Run one generation for a single conversation.

	        Args:
	            messages: List of chat messages (role/content dicts) for one
	                conversation.

	        Returns:
	            List with one decoded string; the string has been passed through
	            ``clean_repeated_substrings``.
	        """
	        import time
	        import sys
	        start_time = time.time()

	        if first_call[0]:
	            print("[INFO] ========== First inference call ==========")
	            first_call[0] = False
	        else:
	            print("[INFO] ========== Subsequent inference call ==========")

	        print("[DEBUG] ========== Starting inference ==========")
	        print(f"[DEBUG] Python version: {sys.version}")
	        print(f"[DEBUG] PyTorch version: {torch.__version__}")
	        print(f"[DEBUG] CUDA available: {torch.cuda.is_available()}")
	        if torch.cuda.is_available():
	            print(f"[DEBUG] CUDA device count: {torch.cuda.device_count()}")
	            print(f"[DEBUG] Current CUDA device: {torch.cuda.current_device()}")
	            print(f"[DEBUG] Device name: {torch.cuda.get_device_name(0)}")
	            print(f"[DEBUG] GPU Memory allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
	            print(f"[DEBUG] GPU Memory reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")

	        # Ensure model is on GPU
	        model_device = next(model.parameters()).device
	        print(f"[DEBUG] Model device: {model_device}")
	        print(f"[DEBUG] Model dtype: {next(model.parameters()).dtype}")

	        if str(model_device) == 'cpu':
	            print("[ERROR] Model on CPU! Attempting to move to GPU...")
	            if torch.cuda.is_available():
	                move_start = time.time()
	                model.cuda()
	                move_time = time.time() - move_start
	                print(f"[DEBUG] Model device after cuda(): {next(model.parameters()).device}")
	                print(f"[DEBUG] Model moved to GPU in: {move_time:.2f}s")
	            else:
	                print("[CRITICAL] CUDA unavailable! Running on CPU will be slow!")
	                print("[CRITICAL] This may be due to ZeroGPU resource constraints")
	        else:
	            print(f"[INFO] Model already on GPU: {model_device}")

	        # The processor helpers work on a batch of conversations.
	        messages = [messages]

	        # Build input using processor
	        texts = [
	            processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
	            for msg in messages
	        ]
	        print(f"[DEBUG] Template built, elapsed: {time.time() - start_time:.2f}s")

	        image_inputs, video_inputs = process_vision_info(messages)
	        print(f"[DEBUG] Image processing done, elapsed: {time.time() - start_time:.2f}s")

	        # Check image input size
	        if image_inputs:
	            for idx, img in enumerate(image_inputs):
	                if hasattr(img, 'size'):
	                    print(f"[DEBUG] Image {idx} size: {img.size}")
	                elif isinstance(img, np.ndarray):
	                    print(f"[DEBUG] Image {idx} shape: {img.shape}")

	        print("[DEBUG] Starting processor encoding...")
	        processor_start = time.time()
	        inputs = processor(
	            text=texts,
	            images=image_inputs,
	            videos=video_inputs,
	            padding=True,
	            return_tensors="pt",
	        )
	        print(f"[DEBUG] Processor encoding done, elapsed: {time.time() - processor_start:.2f}s")

	        # Ensure inputs on GPU
	        to_device_start = time.time()
	        inputs = inputs.to('cuda' if torch.cuda.is_available() else 'cpu')
	        print(f"[DEBUG] Inputs moved to device, elapsed: {time.time() - to_device_start:.2f}s")
	        print(f"[DEBUG] Input preparation done, total elapsed: {time.time() - start_time:.2f}s")
	        print(f"[DEBUG] Input IDs shape: {inputs.input_ids.shape}")
	        print(f"[DEBUG] Input device: {inputs.input_ids.device}")
	        print(f"[DEBUG] Input sequence length: {inputs.input_ids.shape[1]}")

	        # Generation
	        gen_start = time.time()
	        print("[DEBUG] ========== Starting token generation ==========")

	        # Optimized max_new_tokens for OCR tasks
	        max_new_tokens = 2048
	        print(f"[DEBUG] max_new_tokens: {max_new_tokens}")

	        with torch.no_grad():
	            print(f"[DEBUG] Entered torch.no_grad() context, elapsed: {time.time() - start_time:.2f}s")

	            # Smoke-test forward pass. The result is deliberately not bound
	            # to a name so its tensors can be freed before generate() runs.
	            print("[DEBUG] Testing forward pass...")
	            forward_test_start = time.time()
	            try:
	                with torch.autocast(device_type="cuda", dtype=torch.bfloat16):
	                    model(**inputs, use_cache=False)
	                print(f"[DEBUG] Forward pass test successful, elapsed: {time.time() - forward_test_start:.2f}s")
	            except Exception as e:
	                # Best-effort diagnostic only; generation is still attempted.
	                print(f"[WARNING] Forward pass test failed: {e}")

	            print(f"[DEBUG] Starting model.generate()... (elapsed: {time.time() - start_time:.2f}s)")
	            generate_call_start = time.time()

	            try:
	                # Deterministic (greedy) generation. No temperature is passed:
	                # it is not used when do_sample=False.
	                generated_ids = model.generate(
	                    **inputs,
	                    max_new_tokens=max_new_tokens,
	                    do_sample=False,
	                )
	                print(f"[DEBUG] model.generate() returned, elapsed: {time.time() - generate_call_start:.2f}s")
	            except Exception as e:
	                print(f"[ERROR] Generation failed: {e}")
	                import traceback
	                traceback.print_exc()
	                raise

	        print("[DEBUG] Exited torch.no_grad() context")

	        gen_time = time.time() - gen_start
	        print("[DEBUG] ========== Generation complete ==========")
	        print(f"[DEBUG] Generation time: {gen_time:.2f}s")
	        print(f"[DEBUG] Output shape: {generated_ids.shape}")

	        # Decode output: strip the prompt tokens from the front of each sequence.
	        generated_ids_trimmed = [
	            out_ids[len(in_ids):]
	            for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
	        ]

	        actual_tokens = len(generated_ids_trimmed[0])
	        time_per_token = gen_time / actual_tokens if actual_tokens > 0 else 0
	        print(f"[DEBUG] Actual tokens generated: {actual_tokens}")
	        print(f"[DEBUG] Time per token: {time_per_token:.3f}s")

	        output_texts = processor.batch_decode(
	            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
	        )

	        total_time = time.time() - start_time
	        print("[DEBUG] ========== All done ==========")
	        print(f"[DEBUG] Total time: {total_time:.2f}s")
	        print(f"[DEBUG] Output length: {len(output_texts[0])} chars")
	        print(f"[DEBUG] Output preview: {output_texts[0][:100]}...")
	        output_texts[0] = clean_repeated_substrings(output_texts[0])
	        return output_texts

	    def predict(_chatbot, task_history):
	        """Generate the reply for the last turn and yield the updated chat.

	        ``_chatbot`` holds the displayed (query, reply) pairs and
	        ``task_history`` the raw ones; both are updated in place.
	        """
	        chat_query = _chatbot[-1][0]
	        query = task_history[-1][0]
	        if len(chat_query) == 0:
	            # Empty submission: drop the placeholder turn. This is a
	            # generator, so the updated chat must be yielded - a bare
	            # `return value` would never reach the UI.
	            _chatbot.pop()
	            task_history.pop()
	            yield _chatbot
	            return
	        print('User: ', query)
	        history_cp = copy.deepcopy(task_history)
	        messages = []
	        content = []
	        for q, a in history_cp:
	            if isinstance(q, (tuple, list)):
	                # Image turn: collect it until the next text turn.
	                # Check if URL or local path
	                img_path = q[0]
	                if img_path.startswith(('http://', 'https://')):
	                    content.append({'type': 'image', 'image': img_path})
	                else:
	                    content.append({'type': 'image', 'image': f'{os.path.abspath(img_path)}'})
	            else:
	                # Text turn closes the user message and records the reply.
	                content.append({'type': 'text', 'text': q})
	                messages.append({'role': 'user', 'content': content})
	                messages.append({'role': 'assistant', 'content': [{'type': 'text', 'text': a}]})
	                content = []
	        # The final assistant entry is the reply still to be generated.
	        messages.pop()

	        # Call model to get response
	        response_list = call_local_model(messages)
	        response = response_list[0] if response_list else ""

	        _chatbot[-1] = (_parse_text(chat_query), _remove_image_special(_parse_text(response)))
	        full_response = _parse_text(response)

	        task_history[-1] = (query, full_response)
	        print('HunyuanOCR: ' + _parse_text(full_response))
	        yield _chatbot

	    def regenerate(_chatbot, task_history):
	        """Discard the last reply and generate it again via ``predict``."""
	        if not task_history:
	            yield _chatbot
	            return
	        item = task_history[-1]
	        if item[1] is None:
	            # Nothing has been generated for the last turn yet.
	            yield _chatbot
	            return
	        task_history[-1] = (item[0], None)
	        chatbot_item = _chatbot.pop(-1)
	        if chatbot_item[0] is None:
	            _chatbot[-1] = (_chatbot[-1][0], None)
	        else:
	            _chatbot.append((chatbot_item[0], None))
	        yield from predict(_chatbot, task_history)

	    def add_text(history, task_history, text):
	        """Append a text turn to both histories and clear the input box."""
	        task_text = text
	        history = history if history is not None else []
	        task_history = task_history if task_history is not None else []
	        history = history + [(_parse_text(text), None)]
	        task_history = task_history + [(task_text, None)]
	        return history, task_history, ''

	    def add_file(history, task_history, file):
	        """Append an uploaded file (by its ``name`` path) to both histories."""
	        history = history if history is not None else []
	        task_history = task_history if task_history is not None else []
	        history = history + [((file.name,), None)]
	        task_history = task_history + [((file.name,), None)]
	        return history, task_history

	    def download_url_image(url):
	        """Download URL image to local temp file.

	        Returns the cached/downloaded path, or ``url`` itself on any failure.
	        Not called anywhere in the visible code.
	        """
	        try:
	            # Use URL hash as filename to avoid duplicate downloads
	            url_hash = hashlib.md5(url.encode()).hexdigest()
	            temp_dir = tempfile.gettempdir()
	            temp_path = os.path.join(temp_dir, f"hyocr_demo_{url_hash}.png")

	            # Return cached file if exists
	            if os.path.exists(temp_path):
	                return temp_path

	            # Download image
	            response = requests.get(url, timeout=10)
	            response.raise_for_status()
	            with open(temp_path, 'wb') as f:
	                f.write(response.content)
	            return temp_path
	        except Exception as e:
	            # Deliberately best-effort: fall back to the remote URL.
	            print(f"Failed to download image: {url}, error: {e}")
	            return url  # Return original URL on failure

	    def reset_user_input():
	        """Clear the query textbox."""
	        return gr.update(value='')

	    def reset_state(_chatbot, task_history):
	        """Empty both histories in place, free memory, and blank the chat."""
	        task_history.clear()
	        _chatbot.clear()
	        _gc()
	        return []

	    # Example image paths - local files
	    EXAMPLE_IMAGES = {
	        "spotting": "examples/spotting.jpg",
	        "parsing": "examples/parsing.jpg",
	        "ie": "examples/ie.jpg",
	        "vqa": "examples/vqa.jpg",
	        "translation": "examples/translation.jpg",
	    }

	    def _load_example(image_key, prompt):
	        """Return fresh (chat, task history, prompt) holding one example image."""
	        image_path = EXAMPLE_IMAGES[image_key]
	        history = [((image_path,), None)]
	        task_hist = [((image_path,), None)]
	        return history, task_hist, prompt

	    with gr.Blocks() as demo:
	        # Header
	        gr.Markdown("# HunyuanOCR\nPowered by Tencent Hunyuan Team")

	        with gr.Column():
	            # Chat area
	            chatbot = gr.Chatbot(
	                label='Chat',
	                height=600,
	                bubble_full_width=False,
	                layout="bubble",
	                show_copy_button=True,
	            )

	            # Input panel
	            with gr.Group():
	                query = gr.Textbox(
	                    lines=2,
	                    label='Enter your question',
	                    placeholder='Upload an image first, then enter your question. Example: Detect and recognize text in this image.',
	                    show_label=False
	                )

	                with gr.Row():
	                    addfile_btn = gr.UploadButton('Upload Image', file_types=['image'])
	                    submit_btn = gr.Button('Send', variant="primary", scale=3)
	                    regen_btn = gr.Button('Regenerate')
	                    empty_bin = gr.Button('Clear')

	            # Examples section
	            gr.Markdown("### Quick Examples - Click to load")

	            with gr.Row():
	                example_1_btn = gr.Button("Text Detection")
	                example_2_btn = gr.Button("Document Parsing")
	                example_3_btn = gr.Button("Info Extraction")
	                example_4_btn = gr.Button("Visual Q&A")
	                example_5_btn = gr.Button("Translation")

	        task_history = gr.State([])

	        # The loaders ignore the incoming histories: each example starts a
	        # fresh conversation with its image and pre-fills the prompt box.

	        # Example 1: Text Detection
	        def load_example_1(history, task_hist):
	            return _load_example(
	                "spotting",
	                "Detect and recognize all text in this image. Output the text with bounding box coordinates.",
	            )

	        # Example 2: Document Parsing
	        def load_example_2(history, task_hist):
	            return _load_example(
	                "parsing",
	                "Extract all text from this document in markdown format. Use HTML for tables and LaTeX for equations. Parse in reading order.",
	            )

	        # Example 3: Information Extraction
	        def load_example_3(history, task_hist):
	            return _load_example(
	                "ie",
	                "Extract the following fields from this receipt and return as JSON: ['total', 'subtotal', 'tax', 'date', 'items']",
	            )

	        # Example 4: Visual Q&A
	        def load_example_4(history, task_hist):
	            return _load_example(
	                "vqa",
	                "Look at this chart and answer: Which quarter had the highest revenue? What was the Sales value in Q4?",
	            )

	        # Example 5: Translation
	        def load_example_5(history, task_hist):
	            return _load_example(
	                "translation",
	                "Translate all text in this image to English.",
	            )

	        # Bind events
	        example_1_btn.click(load_example_1, [chatbot, task_history], [chatbot, task_history, query])
	        example_2_btn.click(load_example_2, [chatbot, task_history], [chatbot, task_history, query])
	        example_3_btn.click(load_example_3, [chatbot, task_history], [chatbot, task_history, query])
	        example_4_btn.click(load_example_4, [chatbot, task_history], [chatbot, task_history, query])
	        example_5_btn.click(load_example_5, [chatbot, task_history], [chatbot, task_history, query])

	        # add_text returns three values (chat, task history, cleared query),
	        # so all three components are listed as outputs.
	        submit_btn.click(
	            add_text, [chatbot, task_history, query], [chatbot, task_history, query]
	        ).then(predict, [chatbot, task_history], [chatbot], show_progress=True)
	        submit_btn.click(reset_user_input, [], [query])
	        empty_bin.click(reset_state, [chatbot, task_history], [chatbot], show_progress=True)
	        regen_btn.click(regenerate, [chatbot, task_history], [chatbot], show_progress=True)
	        addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn], [chatbot, task_history], show_progress=True)

	        # Feature descriptions
	        with gr.Row():
	            with gr.Column(scale=1):
	                gr.Markdown(
	                    "\n"
	                    "### Core Features\n"
	                    "- Text Detection & Recognition - Multi-scene text detection and recognition\n"
	                    "- Document Parsing - Automatic document structure recognition\n"
	                    "- Information Extraction - Extract structured data from receipts and forms\n"
	                    "- Visual Q&A - Text-centric open-ended question answering\n"
	                    "- Translation - Translate text in images across 14+ languages\n"
	                )

	            with gr.Column(scale=1):
	                gr.Markdown(
	                    "\n"
	                    "### Usage Tips\n"
	                    "- Inference - For production, use VLLM for better performance\n"
	                    "- Image Quality - Ensure images are clear, well-lit, and not heavily skewed\n"
	                    "- File Size - Recommended max 10MB per image, JPG/PNG format\n"
	                    "- Use Cases - OCR, document digitization, receipt recognition, translation\n"
	                )

	        # Footer
	        gr.Markdown("---\n2025 Tencent Hunyuan Team. For research and educational use.")

	    demo.queue().launch(
	        share=args.share,
	        inbrowser=args.inbrowser,
	        # server_port=args.server_port,
	        # server_name=args.server_name,
	    )


	def main():
	    """Parse the CLI options, load the model and processor, then start the demo."""
	    cli_args = _get_args()
	    loaded_model, loaded_processor = _load_model_processor(cli_args)
	    _launch_demo(cli_args, loaded_model, loaded_processor)


	# Start the demo only when this file is executed as a script, not on import.
	if __name__ == '__main__':
	    main()