victor (HF Staff) committed
Commit 9532898 · 1 Parent(s): e220a3a

Rework demo to English with default Gradio theme


- Translate all UI text from Chinese to English
- Remove custom purple gradient CSS, use default Gradio theme
- Add local example images for 5 demo categories (see the sketch below)
- Update example prompts to English
- Translate all code comments to English
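
For reference, the example-image rework described above amounts to pointing `EXAMPLE_IMAGES` at repo-local files and having each example button reset the chat with that image plus an English prompt. Below is a minimal sketch of that pattern, assuming the tuple-style Gradio chat history used elsewhere in this app; the `load_example` factory is a hypothetical condensation of the five `load_example_N` callbacks in the diff, not the exact code committed here.

```python
import gradio as gr

# Local example images shipped with the Space (added in this commit).
EXAMPLE_IMAGES = {
    "spotting": "examples/spotting.jpg",
    "parsing": "examples/parsing.jpg",
    "ie": "examples/ie.jpg",
    "vqa": "examples/vqa.jpg",
    "translation": "examples/translation.jpg",
}

def load_example(key: str, prompt: str):
    """Build a click handler that resets the chat to one example image and prefills the prompt."""
    def _load(history, task_hist):
        image_path = EXAMPLE_IMAGES[key]
        history = [((image_path,), None)]    # what the Chatbot displays
        task_hist = [((image_path,), None)]  # what gets sent to the model
        return history, task_hist, prompt
    return _load

with gr.Blocks() as demo:  # default theme, no custom CSS
    chatbot = gr.Chatbot(label="Chat", height=600)
    query = gr.Textbox(show_label=False)
    task_history = gr.State([])

    spotting_btn = gr.Button("Text Detection")
    spotting_btn.click(
        load_example("spotting", "Detect and recognize all text in this image."),
        [chatbot, task_history],
        [chatbot, task_history, query],
    )
```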

README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-title: Hyocr
+title: HunyuanOCR-EN
 emoji: 🏃
 colorFrom: blue
 colorTo: yellow
app.py CHANGED
@@ -4,7 +4,7 @@ import numpy as np
4
  from PIL import Image
5
  import spaces
6
  from transformers import AutoProcessor
7
- from qwen_vl_utils import process_vision_info # Make sure this module is available in your environment
8
  from transformers import HunYuanVLForConditionalGeneration
9
  import gradio as gr
10
  from argparse import ArgumentParser
@@ -15,10 +15,9 @@ import tempfile
15
  import hashlib
16
  import gc
17
 
18
- # Key optimization: set environment variables to speed up transformers
19
- os.environ['TOKENIZERS_PARALLELISM'] = 'false' # Avoid tokenizer warnings
20
  os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
21
- # Disable PyTorch JIT fusion optimizations (can make the first run extremely slow in some cases)
22
  # torch._C._jit_set_profiling_executor(False)
23
  # torch._C._jit_set_profiling_mode(False)
24
 
@@ -55,35 +54,35 @@ def _get_args():
55
 
56
 
57
  def _load_model_processor(args):
58
- # ZeroGPU environment: the model loads on CPU and uses eager mode
59
- # It is moved to the GPU automatically inside the @spaces.GPU decorator
60
- print(f"[INFO] Loading model (eager mode for the ZeroGPU environment)")
61
- print(f"[INFO] CUDA available at load time: {torch.cuda.is_available()}")
62
 
63
  model = HunYuanVLForConditionalGeneration.from_pretrained(
64
  args.checkpoint_path,
65
- attn_implementation="eager", # ZeroGPU must use eager because the model starts on CPU
66
  torch_dtype=torch.bfloat16,
67
- device_map="auto", # Back to auto so ZeroGPU manages device placement
68
  )
69
-
70
- # Important: disable gradient checkpointing (extremely slow if left enabled)
71
  if hasattr(model, 'gradient_checkpointing_disable'):
72
  model.gradient_checkpointing_disable()
73
- print(f"[INFO] Gradient checkpointing disabled")
74
-
75
- # Set to evaluation mode
76
  model.eval()
77
- print(f"[INFO] Model set to eval mode")
78
 
79
  processor = AutoProcessor.from_pretrained(args.checkpoint_path, use_fast=False, trust_remote_code=True)
80
 
81
- print(f"[INFO] Model loaded, current device: {next(model.parameters()).device}")
82
  return model, processor
83
 
84
 
85
  def _parse_text(text):
86
- """Parse text and handle special formatting"""
87
  # if text is None:
88
  # return text
89
  text = text.replace("<trans>", "").replace("</trans>", "")
@@ -91,10 +90,10 @@ def _parse_text(text):
91
 
92
 
93
  def _remove_image_special(text):
94
- """Remove special image tokens"""
95
  # if text is None:
96
  # return text
97
- # # Remove possible special image tokens
98
  # import re
99
  # text = re.sub(r'<image>|</image>|<img>|</img>', '', text)
100
  # return text
@@ -102,7 +101,7 @@ def _remove_image_special(text):
102
 
103
 
104
  def _gc():
105
- """Garbage collection"""
106
  gc.collect()
107
  if torch.cuda.is_available():
108
  torch.cuda.empty_cache()
@@ -128,11 +127,11 @@ def clean_repeated_substrings(text):
128
 
129
 
130
  def _launch_demo(args, model, processor):
131
- # Global flag used to track whether this is the first call
132
  first_call = [True]
133
-
134
- # Key fix: drop the model/processor parameters and access them via closure
135
- # Increase duration to 120 seconds to avoid timeouts at peak times
136
  @spaces.GPU(duration=120)
137
  def call_local_model(messages):
138
  import time
@@ -140,12 +139,12 @@ def _launch_demo(args, model, processor):
140
  start_time = time.time()
141
 
142
  if first_call[0]:
143
- print(f"[INFO] ========== This is the first inference call ==========")
144
  first_call[0] = False
145
  else:
146
- print(f"[INFO] ========== This is the Nth inference call ==========")
147
-
148
- print(f"[DEBUG] ========== Starting inference ==========")
149
  print(f"[DEBUG] Python version: {sys.version}")
150
  print(f"[DEBUG] PyTorch version: {torch.__version__}")
151
  print(f"[DEBUG] CUDA available: {torch.cuda.is_available()}")
@@ -156,38 +155,38 @@ def _launch_demo(args, model, processor):
156
  print(f"[DEBUG] GPU Memory allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
157
  print(f"[DEBUG] GPU Memory reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")
158
 
159
- # Important: check and make sure the model is on the GPU
160
  model_device = next(model.parameters()).device
161
  print(f"[DEBUG] Model device: {model_device}")
162
  print(f"[DEBUG] Model dtype: {next(model.parameters()).dtype}")
163
-
164
  if str(model_device) == 'cpu':
165
- print(f"[ERROR] Model is on CPU! Trying to move it to the GPU...")
166
  if torch.cuda.is_available():
167
  move_start = time.time()
168
  model.cuda()
169
  move_time = time.time() - move_start
170
  print(f"[DEBUG] Model device after cuda(): {next(model.parameters()).device}")
171
- print(f"[DEBUG] Moving the model to GPU took: {move_time:.2f}s")
172
  else:
173
- print(f"[CRITICAL] CUDA unavailable! Running on CPU will be very slow!")
174
- print(f"[CRITICAL] This is likely due to ZeroGPU resource pressure or a timeout")
175
  else:
176
- print(f"[INFO] Model already on GPU: {model_device}")
177
 
178
  messages = [messages]
179
 
180
- # Build the input format with the processor
181
  texts = [
182
  processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
183
  for msg in messages
184
  ]
185
- print(f"[DEBUG] Template built, elapsed: {time.time() - start_time:.2f}s")
186
-
187
  image_inputs, video_inputs = process_vision_info(messages)
188
- print(f"[DEBUG] Image processing done, elapsed: {time.time() - start_time:.2f}s")
189
-
190
- # Check image input sizes
191
  if image_inputs:
192
  for idx, img in enumerate(image_inputs):
193
  if hasattr(img, 'size'):
@@ -195,10 +194,7 @@ def _launch_demo(args, model, processor):
195
  elif isinstance(img, np.ndarray):
196
  print(f"[DEBUG] Image {idx} shape: {img.shape}")
197
 
198
- print(f"[DEBUG] Starting processor input encoding...")
199
- processor_start = time.time()
200
-
201
- print(f"[DEBUG] Starting processor input encoding...")
202
  processor_start = time.time()
203
  inputs = processor(
204
  text=texts,
@@ -207,27 +203,26 @@ def _launch_demo(args, model, processor):
207
  padding=True,
208
  return_tensors="pt",
209
  )
210
- print(f"[DEBUG] Processor encoding done, elapsed: {time.time() - processor_start:.2f}s")
211
-
212
- # Make sure inputs are on the GPU
213
  to_device_start = time.time()
214
  inputs = inputs.to('cuda' if torch.cuda.is_available() else 'cpu')
215
- print(f"[DEBUG] Moving inputs to device took: {time.time() - to_device_start:.2f}s")
216
- print(f"[DEBUG] Input preparation done, total elapsed: {time.time() - start_time:.2f}s")
217
  print(f"[DEBUG] Input IDs shape: {inputs.input_ids.shape}")
218
  print(f"[DEBUG] Input device: {inputs.input_ids.device}")
219
  print(f"[DEBUG] Input sequence length: {inputs.input_ids.shape[1]}")
220
 
221
- # Generation
222
  gen_start = time.time()
223
- print(f"[DEBUG] ========== Starting token generation ==========")
224
-
225
- # Key optimization: adjust max_new_tokens dynamically by task type
226
- # OCR tasks rarely need 8192 tokens, which only adds unnecessary waiting
227
- max_new_tokens = 2048 # Lowered from 8192 to 2048 for a large speedup
228
  print(f"[DEBUG] max_new_tokens: {max_new_tokens}")
229
-
230
- # Add a progress callback
231
  token_count = [0]
232
  last_time = [gen_start]
233
 
@@ -237,49 +232,49 @@ def _launch_demo(args, model, processor):
237
  if token_count[0] % 10 == 0 or (current_time - last_time[0]) > 2.0:
238
  elapsed = current_time - gen_start
239
  tokens_per_sec = token_count[0] / elapsed if elapsed > 0 else 0
240
- print(f"[DEBUG] Generated {token_count[0]} tokens, speed: {tokens_per_sec:.2f} tokens/s, elapsed: {elapsed:.2f}s")
241
  last_time[0] = current_time
242
  return False
243
 
244
  with torch.no_grad():
245
- print(f"[DEBUG] Entered torch.no_grad() context, elapsed: {time.time() - start_time:.2f}s")
246
-
247
- # Run a simple forward-pass test first
248
- print(f"[DEBUG] Testing forward pass...")
249
  forward_test_start = time.time()
250
  try:
251
  with torch.cuda.amp.autocast(dtype=torch.bfloat16):
252
  test_outputs = model(**inputs, use_cache=False)
253
- print(f"[DEBUG] Forward-pass test succeeded, elapsed: {time.time() - forward_test_start:.2f}s")
254
  except Exception as e:
255
- print(f"[WARNING] Forward-pass test failed: {e}")
256
-
257
- print(f"[DEBUG] Calling model.generate()... (elapsed so far: {time.time() - start_time:.2f}s)")
258
  generate_call_start = time.time()
259
 
260
  try:
261
- # Important: use more aggressive generation parameters to force early stopping
262
  generated_ids = model.generate(
263
  **inputs,
264
  max_new_tokens=max_new_tokens,
265
  do_sample=False,
266
  temperature=0
267
  )
268
- print(f"[DEBUG] model.generate() returned, elapsed: {time.time() - generate_call_start:.2f}s")
269
  except Exception as e:
270
- print(f"[ERROR] Generation failed: {e}")
271
  import traceback
272
  traceback.print_exc()
273
  raise
274
 
275
- print(f"[DEBUG] Exited torch.no_grad() context")
276
-
277
  gen_time = time.time() - gen_start
278
- print(f"[DEBUG] ========== Generation complete ==========")
279
- print(f"[DEBUG] Generation time: {gen_time:.2f}s")
280
  print(f"[DEBUG] Output shape: {generated_ids.shape}")
281
-
282
- # Decode the output
283
  if "input_ids" in inputs:
284
  input_ids = inputs.input_ids
285
  else:
@@ -290,8 +285,8 @@ def _launch_demo(args, model, processor):
290
  ]
291
 
292
  actual_tokens = len(generated_ids_trimmed[0])
293
- print(f"[DEBUG] 实际生成 token 数: {actual_tokens}")
294
- print(f"[DEBUG] token 耗时: {gen_time/actual_tokens if actual_tokens > 0 else 0:.3f}s")
295
 
296
  output_texts = processor.batch_decode(
297
  generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
@@ -299,10 +294,10 @@ def _launch_demo(args, model, processor):
299
 
300
 
301
  total_time = time.time() - start_time
302
- print(f"[DEBUG] ========== All done ==========")
303
- print(f"[DEBUG] Total time: {total_time:.2f}s")
304
- print(f"[DEBUG] Output length: {len(output_texts[0])} characters")
305
- print(f"[DEBUG] Output preview: {output_texts[0][:100]}...")
306
  output_texts[0] = clean_repeated_substrings(output_texts[0])
307
  return output_texts
308
 
@@ -324,7 +319,7 @@ def _launch_demo(args, model, processor):
324
  content = []
325
  for q, a in history_cp:
326
  if isinstance(q, (tuple, list)):
327
- # 判断是URL还是本地路径
328
  img_path = q[0]
329
  if img_path.startswith(('http://', 'https://')):
330
  content.append({'type': 'image', 'image': img_path})
@@ -337,7 +332,7 @@ def _launch_demo(args, model, processor):
337
  content = []
338
  messages.pop()
339
 
340
- # Call the model to get a response (changed: model and processor are no longer passed in)
341
  response_list = call_local_model(messages)
342
  response = response_list[0] if response_list else ""
343
 
@@ -365,7 +360,7 @@ def _launch_demo(args, model, processor):
365
  _chatbot[-1] = (_chatbot[-1][0], None)
366
  else:
367
  _chatbot.append((chatbot_item[0], None))
368
- # Use the outer predict function
369
  _chatbot_gen = predict(_chatbot, task_history)
370
  for _chatbot in _chatbot_gen:
371
  yield _chatbot
@@ -391,26 +386,26 @@ def _launch_demo(args, model, processor):
391
  return history, task_history
392
 
393
  def download_url_image(url):
394
- """Download an image URL to a local temp file"""
395
  try:
396
- # Use the URL hash as the filename to avoid duplicate downloads
397
  url_hash = hashlib.md5(url.encode()).hexdigest()
398
  temp_dir = tempfile.gettempdir()
399
  temp_path = os.path.join(temp_dir, f"hyocr_demo_{url_hash}.png")
400
-
401
- # Return immediately if the file already exists
402
  if os.path.exists(temp_path):
403
  return temp_path
404
-
405
- # Download the image
406
  response = requests.get(url, timeout=10)
407
  response.raise_for_status()
408
  with open(temp_path, 'wb') as f:
409
  f.write(response.content)
410
  return temp_path
411
  except Exception as e:
412
- print(f"Failed to download image: {url}, error: {e}")
413
- return url # Return the original URL on failure
414
 
415
  def reset_user_input():
416
  return gr.update(value='')
@@ -421,336 +416,98 @@ def _launch_demo(args, model, processor):
421
  _gc()
422
  return []
423
 
424
- # Example image path configuration - replace with real image paths
425
  EXAMPLE_IMAGES = {
426
- "spotting": "https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/23cc43af9376b948f3febaf4ce854a8a.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763523817%3B1794627877&q-key-time=1763523817%3B1794627877&q-header-list=host&q-url-param-list=&q-signature=8ebd6a9d3ed7eba73bb783c337349db9c29972e2", # TODO: replace with the scene-text example image path
427
- "parsing": "https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/c4997ebd1be9f7c3e002fabba8b46cb7.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763523818%3B1794627878&q-key-time=1763523818%3B1794627878&q-header-list=host&q-url-param-list=&q-signature=d2cd12be4c7902821c8c82203e4642624046911a",
428
- "ie": "https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/7c67c0f78e4423d51644a325da1f8e85.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763523818%3B1794627878&q-key-time=1763523818%3B1794627878&q-header-list=host&q-url-param-list=&q-signature=803648f3253706f654faf1423869fd9e00e7056e",
429
- "vqa": "https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/fea0865d1c70c53aaa2ab91cd0e787f5.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763523818%3B1794627878&q-key-time=1763523818%3B1794627878&q-header-list=host&q-url-param-list=&q-signature=a92b94e298a11aea130d730d3b16ee761acc3f4c",
430
- "translation": "https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/d1af99d35e9db9e820ebebb5bc68993a.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763967603%3B1795071663&q-key-time=1763967603%3B1795071663&q-header-list=host&q-url-param-list=&q-signature=a57080c0b3d4c76ea74b88c6291f9004241c9d49",
431
- # "spotting": "examples/spotting.jpg",
432
- # "parsing": "examples/parsing.jpg",
433
- # "ie": "examples/ie.jpg",
434
- # "vqa": "examples/vqa.jpg",
435
- # "translation": "examples/translation.jpg"
436
  }
437
 
438
- with gr.Blocks(css="""
439
- body {
440
- background: #f5f7fa;
441
- }
442
- .gradio-container {
443
- max-width: 100% !important;
444
- padding: 0 40px !important;
445
- }
446
- .header-section {
447
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
448
- padding: 30px 0;
449
- margin: -20px -40px 30px -40px;
450
- box-shadow: 0 2px 10px rgba(0,0,0,0.1);
451
- }
452
- .header-content {
453
- max-width: 1600px;
454
- margin: 0 auto;
455
- padding: 0 40px;
456
- display: flex;
457
- align-items: center;
458
- gap: 20px;
459
- }
460
- .header-logo {
461
- height: 60px;
462
- }
463
- .header-text h1 {
464
- color: white;
465
- font-size: 32px;
466
- font-weight: bold;
467
- margin: 0 0 5px 0;
468
- }
469
- .header-text p {
470
- color: rgba(255,255,255,0.9);
471
- margin: 0;
472
- font-size: 14px;
473
- }
474
- .main-container {
475
- max-width: 1800px;
476
- margin: 0 auto;
477
- }
478
- .chatbot {
479
- box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08) !important;
480
- border-radius: 12px !important;
481
- border: 1px solid #e5e7eb !important;
482
- background: white !important;
483
- }
484
- .input-panel {
485
- background: white;
486
- padding: 20px;
487
- border-radius: 12px;
488
- box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
489
- border: 1px solid #e5e7eb;
490
- }
491
- .input-box textarea {
492
- border: 2px solid #e5e7eb !important;
493
- border-radius: 8px !important;
494
- font-size: 14px !important;
495
- }
496
- .input-box textarea:focus {
497
- border-color: #667eea !important;
498
- }
499
- .btn-primary {
500
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
501
- border: none !important;
502
- color: white !important;
503
- font-weight: 500 !important;
504
- padding: 10px 24px !important;
505
- font-size: 14px !important;
506
- }
507
- .btn-primary:hover {
508
- transform: translateY(-1px) !important;
509
- box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4) !important;
510
- }
511
- .btn-secondary {
512
- background: white !important;
513
- border: 2px solid #667eea !important;
514
- color: #667eea !important;
515
- padding: 8px 20px !important;
516
- font-size: 14px !important;
517
- }
518
- .btn-secondary:hover {
519
- background: #f0f4ff !important;
520
- }
521
- .example-grid {
522
- display: grid;
523
- grid-template-columns: repeat(4, 1fr);
524
- gap: 20px;
525
- margin-top: 30px;
526
- }
527
- .example-card {
528
- background: white;
529
- border-radius: 12px;
530
- overflow: hidden;
531
- box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
532
- border: 1px solid #e5e7eb;
533
- transition: all 0.3s ease;
534
- }
535
- .example-card:hover {
536
- transform: translateY(-4px);
537
- box-shadow: 0 8px 20px rgba(102, 126, 234, 0.15);
538
- border-color: #667eea;
539
- }
540
- .example-image-wrapper {
541
- width: 100%;
542
- height: 180px;
543
- overflow: hidden;
544
- background: #f5f7fa;
545
- }
546
- .example-image-wrapper img {
547
- width: 100%;
548
- height: 100%;
549
- object-fit: cover;
550
- }
551
- .example-btn {
552
- width: 100% !important;
553
- white-space: pre-wrap !important;
554
- text-align: left !important;
555
- padding: 16px !important;
556
- background: white !important;
557
- border: none !important;
558
- border-top: 1px solid #e5e7eb !important;
559
- color: #1f2937 !important;
560
- font-size: 14px !important;
561
- line-height: 1.6 !important;
562
- transition: all 0.3s ease !important;
563
- font-weight: 500 !important;
564
- }
565
- .example-btn:hover {
566
- background: #f9fafb !important;
567
- color: #667eea !important;
568
- }
569
- .feature-section {
570
- background: white;
571
- padding: 24px;
572
- border-radius: 12px;
573
- margin-top: 30px;
574
- box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
575
- border: 1px solid #e5e7eb;
576
- }
577
- .section-title {
578
- font-size: 18px;
579
- font-weight: 600;
580
- color: #1f2937;
581
- margin-bottom: 20px;
582
- padding-bottom: 12px;
583
- border-bottom: 2px solid #e5e7eb;
584
- }
585
- """) as demo:
586
- # Top navigation bar
587
- gr.HTML("""
588
- <div class="header-section">
589
- <div class="header-content">
590
- <img src="https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/6ef6928b21b323b2b00115f86a779d8f.png?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763450355%3B1794554415&q-key-time=1763450355%3B1794554415&q-header-list=host&q-url-param-list=&q-signature=41328696dc34571324aa18c791c1196192e729c6" class="header-logo"/>
591
- <div class="header-text">
592
- <h1>HunyuanOCR</h1>
593
- <p>Powered by Tencent Hunyuan Team</p>
594
- </div>
595
- </div>
596
- </div>
597
- """)
598
 
599
- with gr.Column(elem_classes=["main-container"]):
600
- # Chat area - full width
601
  chatbot = gr.Chatbot(
602
- label='💬 Chat Window',
603
  height=600,
604
  bubble_full_width=False,
605
  layout="bubble",
606
  show_copy_button=True,
607
- avatar_images=(None, "https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/6ef6928b21b323b2b00115f86a779d8f.png?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763450355%3B1794554415&q-key-time=1763450355%3B1794554415&q-header-list=host&q-url-param-list=&q-signature=41328696dc34571324aa18c791c1196192e729c6"),
608
- elem_classes=["chatbot"]
609
  )
610
-
611
- # Input control panel - full width
612
- with gr.Group(elem_classes=["input-panel"]):
613
  query = gr.Textbox(
614
- lines=2,
615
- label='💭 Enter your question',
616
- placeholder='Upload an image first, then enter your question. For example: Detect and recognize the text in the image and output the text with formatted coordinates.',
617
- elem_classes=["input-box"],
618
  show_label=False
619
  )
620
-
621
  with gr.Row():
622
- addfile_btn = gr.UploadButton('📁 Upload Image', file_types=['image'], elem_classes=["btn-secondary"])
623
- submit_btn = gr.Button('🚀 Send Message', variant="primary", elem_classes=["btn-primary"], scale=3)
624
- regen_btn = gr.Button('🔄 Regenerate', elem_classes=["btn-secondary"])
625
- empty_bin = gr.Button('🗑️ Clear Chat', elem_classes=["btn-secondary"])
626
-
627
- # Examples area - 5-column grid layout
628
- gr.HTML('<div class="section-title">📚 Quick Examples - click a card below to load it</div>')
629
 
 
 
 
630
  with gr.Row():
631
- # Example 1: spotting
632
- with gr.Column(scale=1):
633
- with gr.Group(elem_classes=["example-card"]):
634
- gr.HTML("""
635
- <div class="example-image-wrapper">
636
- <img src="https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/23cc43af9376b948f3febaf4ce854a8a.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763523817%3B1794627877&q-key-time=1763523817%3B1794627877&q-header-list=host&q-url-param-list=&q-signature=8ebd6a9d3ed7eba73bb783c337349db9c29972e2" alt="Text detection and recognition"/>
637
- </div>
638
- """)
639
- example_1_btn = gr.Button("🔍 Text Detection and Recognition", elem_classes=["example-btn"])
640
-
641
- # Example 2: parsing
642
- with gr.Column(scale=1):
643
- with gr.Group(elem_classes=["example-card"]):
644
- gr.HTML("""
645
- <div class="example-image-wrapper">
646
- <img src="https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/c4997ebd1be9f7c3e002fabba8b46cb7.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763523818%3B1794627878&q-key-time=1763523818%3B1794627878&q-header-list=host&q-url-param-list=&q-signature=d2cd12be4c7902821c8c82203e4642624046911a" alt="Document parsing"/>
647
- </div>
648
- """)
649
- example_2_btn = gr.Button("📋 Document Parsing", elem_classes=["example-btn"])
650
-
651
- # Example 3: ie
652
- with gr.Column(scale=1):
653
- with gr.Group(elem_classes=["example-card"]):
654
- gr.HTML("""
655
- <div class="example-image-wrapper">
656
- <img src="https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/7c67c0f78e4423d51644a325da1f8e85.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763523818%3B1794627878&q-key-time=1763523818%3B1794627878&q-header-list=host&q-url-param-list=&q-signature=803648f3253706f654faf1423869fd9e00e7056e" alt="Information extraction"/>
657
- </div>
658
- """)
659
- example_3_btn = gr.Button("🎯 Information Extraction", elem_classes=["example-btn"])
660
-
661
- # Example 4: VQA
662
- with gr.Column(scale=1):
663
- with gr.Group(elem_classes=["example-card"]):
664
- gr.HTML("""
665
- <div class="example-image-wrapper">
666
- <img src="https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/fea0865d1c70c53aaa2ab91cd0e787f5.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763523818%3B1794627878&q-key-time=1763523818%3B1794627878&q-header-list=host&q-url-param-list=&q-signature=a92b94e298a11aea130d730d3b16ee761acc3f4c" alt="Visual question answering"/>
667
- </div>
668
- """)
669
- example_4_btn = gr.Button("💬 Visual Q&A", elem_classes=["example-btn"])
670
-
671
- # Example 5: translation
672
- with gr.Column(scale=1):
673
- with gr.Group(elem_classes=["example-card"]):
674
- gr.HTML("""
675
- <div class="example-image-wrapper">
676
- <img src="https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/d1af99d35e9db9e820ebebb5bc68993a.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763967603%3B1795071663&q-key-time=1763967603%3B1795071663&q-header-list=host&q-url-param-list=&q-signature=a57080c0b3d4c76ea74b88c6291f9004241c9d49" alt="Image translation"/>
677
- </div>
678
- """)
679
- example_5_btn = gr.Button("🌐 Image Translation", elem_classes=["example-btn"])
680
 
681
  task_history = gr.State([])
682
 
683
 
684
- # Example 1: document recognition
685
  def load_example_1(history, task_hist):
686
- prompt = "Detect and recognize the text in the image, and output the text with formatted coordinates."
687
- image_url = EXAMPLE_IMAGES["spotting"]
688
- # Download the URL image locally
689
- image_path = download_url_image(image_url)
690
- # Clear the chat history
691
- history = []
692
- task_hist = []
693
- history = history + [((image_path,), None)]
694
- task_hist = task_hist + [((image_path,), None)]
695
  return history, task_hist, prompt
696
-
697
-
698
-
699
- # Example 2: scene text
700
  def load_example_2(history, task_hist):
701
- prompt = "Extract all body content from the document image and represent it in markdown format, ignoring headers and footers; represent tables in HTML and formulas in LaTeX, and organize the parse in reading order."
702
- image_url = EXAMPLE_IMAGES["parsing"]
703
- # Download the URL image locally
704
- image_path = download_url_image(image_url)
705
- # Clear the chat history
706
- history = []
707
- task_hist = []
708
- history = history + [((image_path,), None)]
709
- task_hist = task_hist + [((image_path,), None)]
710
  return history, task_hist, prompt
711
-
712
-
713
-
714
- # Example 3: table extraction
715
  def load_example_3(history, task_hist):
716
- prompt = "Extract the following fields from the image: ['unit price', 'pickup time', 'invoice number', 'province prefix', 'total amount', 'invoice code', 'drop-off time', 'mileage'], and return them in JSON format."
717
- image_url = EXAMPLE_IMAGES["ie"]
718
- # Download the URL image locally
719
- image_path = download_url_image(image_url)
720
- # Clear the chat history
721
- history = []
722
- task_hist = []
723
- history = history + [((image_path,), None)]
724
- task_hist = task_hist + [((image_path,), None)]
725
  return history, task_hist, prompt
726
-
727
- # Example 4: handwriting
728
  def load_example_4(history, task_hist):
729
- prompt = "What is the highest life expectancy at birth of male?"
730
- image_url = EXAMPLE_IMAGES["vqa"]
731
- # Download the URL image locally
732
- image_path = download_url_image(image_url)
733
- # Clear the chat history
734
- history = []
735
- task_hist = []
736
- history = history + [((image_path,), None)]
737
- task_hist = task_hist + [((image_path,), None)]
738
  return history, task_hist, prompt
739
-
740
- # Example 5: translation
741
  def load_example_5(history, task_hist):
742
- prompt = "Translate the text in the image into Chinese."
743
- image_url = EXAMPLE_IMAGES["translation"]
744
- # Download the URL image locally
745
- image_path = download_url_image(image_url)
746
- # Clear the chat history
747
- history = []
748
- task_hist = []
749
- history = history + [((image_path,), None)]
750
- task_hist = task_hist + [((image_path,), None)]
751
  return history, task_hist, prompt
752
 
753
- # Bind events
754
  example_1_btn.click(load_example_1, [chatbot, task_history], [chatbot, task_history, query])
755
  example_2_btn.click(load_example_2, [chatbot, task_history], [chatbot, task_history, query])
756
  example_3_btn.click(load_example_3, [chatbot, task_history], [chatbot, task_history, query])
@@ -764,43 +521,29 @@ def _launch_demo(args, model, processor):
764
  regen_btn.click(regenerate, [chatbot, task_history], [chatbot], show_progress=True)
765
  addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn], [chatbot, task_history], show_progress=True)
766
 
767
- # Feature description area
768
  with gr.Row():
769
  with gr.Column(scale=1):
770
- gr.HTML("""
771
- <div class="feature-section">
772
- <div class="section-title">✨ Core Features</div>
773
- <ul style="line-height: 2; color: #4b5563; font-size: 14px; margin: 0; padding-left: 20px;">
774
- <li><strong>🎯 High-accuracy text detection and recognition</strong> - supports text detection and recognition across many scenes</li>
775
- <li><strong>📐 Intelligent document parsing</strong> - automatically recognizes document structure and supports multi-granularity parsing</li>
776
- <li><strong>📋 Information extraction</strong> - recognizes 30+ common cards, certificates, and receipts with structured output</li>
777
- <li><strong>✏️ Visual question answering</strong> - supports text-centric open-ended Q&A</li>
778
- <li><strong>🌍 Cross-lingual translation</strong> - Chinese-English translation, plus 14+ languages into Chinese or English</li>
779
- </ul>
780
- </div>
781
  """)
782
-
783
  with gr.Column(scale=1):
784
- gr.HTML("""
785
- <div class="feature-section">
786
- <div class="section-title">💡 Usage Tips</div>
787
- <ul style="line-height: 2; color: #4b5563; font-size: 14px; margin: 0; padding-left: 20px;">
788
- <li><strong>Inference framework</strong> - for production, VLLM is recommended for better inference performance and accuracy</li>
789
- <li><strong>Shooting angle</strong> - make sure images are clear and well lit with moderate resolution; avoid heavy tilt, occlusion, or glare; straight-on shots work best</li>
790
- <li><strong>File size</strong> - keep each image under 10MB; JPG/PNG formats are supported</li>
791
- <li><strong>Use cases</strong> - text detection and recognition, document digitization, receipt recognition, information extraction, image text translation, and more</li>
792
- <li><strong>Compliant use</strong> - for learning and research only; follow applicable laws and respect privacy</li>
793
- </ul>
794
- </div>
795
  """)
796
-
797
- # Footer copyright notice
798
- gr.HTML("""
799
- <div style="text-align: center; color: #9ca3af; font-size: 13px; margin-top: 40px; padding: 20px; border-top: 1px solid #e5e7eb;">
800
- <p style="margin: 0;">© 2025 Tencent Hunyuan Team. All rights reserved.</p>
801
- <p style="margin: 5px 0 0 0;">Built on HunyuanOCR | For learning and research use only</p>
802
- </div>
803
- """)
804
 
805
  demo.queue().launch(
806
  share=args.share,
 
4
  from PIL import Image
5
  import spaces
6
  from transformers import AutoProcessor
7
+ from qwen_vl_utils import process_vision_info
8
  from transformers import HunYuanVLForConditionalGeneration
9
  import gradio as gr
10
  from argparse import ArgumentParser
 
15
  import hashlib
16
  import gc
17
 
18
+ # Optimization: Set environment variables
19
+ os.environ['TOKENIZERS_PARALLELISM'] = 'false'
20
  os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
 
21
  # torch._C._jit_set_profiling_executor(False)
22
  # torch._C._jit_set_profiling_mode(False)
23
 
 
54
 
55
 
56
  def _load_model_processor(args):
57
+ # ZeroGPU: Model loads on CPU, uses eager mode
58
+ # Automatically moves to GPU within @spaces.GPU decorator
59
+ print(f"[INFO] Loading model (ZeroGPU uses eager mode)")
60
+ print(f"[INFO] CUDA available at load time: {torch.cuda.is_available()}")
61
 
62
  model = HunYuanVLForConditionalGeneration.from_pretrained(
63
  args.checkpoint_path,
64
+ attn_implementation="eager", # Required for ZeroGPU (starts on CPU)
65
  torch_dtype=torch.bfloat16,
66
+ device_map="auto", # Let ZeroGPU manage device placement
67
  )
68
+
69
+ # Disable gradient checkpointing for faster inference
70
  if hasattr(model, 'gradient_checkpointing_disable'):
71
  model.gradient_checkpointing_disable()
72
+ print(f"[INFO] Gradient checkpointing disabled")
73
+
74
+ # Set to evaluation mode
75
  model.eval()
76
+ print(f"[INFO] Model set to eval mode")
77
 
78
  processor = AutoProcessor.from_pretrained(args.checkpoint_path, use_fast=False, trust_remote_code=True)
79
 
80
+ print(f"[INFO] Model loaded, device: {next(model.parameters()).device}")
81
  return model, processor
82
 
83
 
84
  def _parse_text(text):
85
+ """Parse text, handle special formatting"""
86
  # if text is None:
87
  # return text
88
  text = text.replace("<trans>", "").replace("</trans>", "")
 
90
 
91
 
92
  def _remove_image_special(text):
93
+ """Remove image special tokens"""
94
  # if text is None:
95
  # return text
96
+ # # Remove image special tokens
97
  # import re
98
  # text = re.sub(r'<image>|</image>|<img>|</img>', '', text)
99
  # return text
 
101
 
102
 
103
  def _gc():
104
+ """Garbage collection"""
105
  gc.collect()
106
  if torch.cuda.is_available():
107
  torch.cuda.empty_cache()
 
127
 
128
 
129
  def _launch_demo(args, model, processor):
130
+ # Track first call
131
  first_call = [True]
132
+
133
+ # Uses closure to access model and processor
134
+ # Duration increased to 120s to avoid timeout during peak hours
135
  @spaces.GPU(duration=120)
136
  def call_local_model(messages):
137
  import time
 
139
  start_time = time.time()
140
 
141
  if first_call[0]:
142
+ print(f"[INFO] ========== First inference call ==========")
143
  first_call[0] = False
144
  else:
145
+ print(f"[INFO] ========== Subsequent inference call ==========")
146
+
147
+ print(f"[DEBUG] ========== Starting inference ==========")
148
  print(f"[DEBUG] Python version: {sys.version}")
149
  print(f"[DEBUG] PyTorch version: {torch.__version__}")
150
  print(f"[DEBUG] CUDA available: {torch.cuda.is_available()}")
 
155
  print(f"[DEBUG] GPU Memory allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
156
  print(f"[DEBUG] GPU Memory reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")
157
 
158
+ # Ensure model is on GPU
159
  model_device = next(model.parameters()).device
160
  print(f"[DEBUG] Model device: {model_device}")
161
  print(f"[DEBUG] Model dtype: {next(model.parameters()).dtype}")
162
+
163
  if str(model_device) == 'cpu':
164
+ print(f"[ERROR] Model on CPU! Attempting to move to GPU...")
165
  if torch.cuda.is_available():
166
  move_start = time.time()
167
  model.cuda()
168
  move_time = time.time() - move_start
169
  print(f"[DEBUG] Model device after cuda(): {next(model.parameters()).device}")
170
+ print(f"[DEBUG] Model moved to GPU in: {move_time:.2f}s")
171
  else:
172
+ print(f"[CRITICAL] CUDA unavailable! Running on CPU will be slow!")
173
+ print(f"[CRITICAL] This may be due to ZeroGPU resource constraints")
174
  else:
175
+ print(f"[INFO] Model already on GPU: {model_device}")
176
 
177
  messages = [messages]
178
 
179
+ # Build input using processor
180
  texts = [
181
  processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
182
  for msg in messages
183
  ]
184
+ print(f"[DEBUG] Template built, elapsed: {time.time() - start_time:.2f}s")
185
+
186
  image_inputs, video_inputs = process_vision_info(messages)
187
+ print(f"[DEBUG] Image processing done, elapsed: {time.time() - start_time:.2f}s")
188
+
189
+ # Check image input size
190
  if image_inputs:
191
  for idx, img in enumerate(image_inputs):
192
  if hasattr(img, 'size'):
 
194
  elif isinstance(img, np.ndarray):
195
  print(f"[DEBUG] Image {idx} shape: {img.shape}")
196
 
197
+ print(f"[DEBUG] Starting processor encoding...")
 
 
 
198
  processor_start = time.time()
199
  inputs = processor(
200
  text=texts,
 
203
  padding=True,
204
  return_tensors="pt",
205
  )
206
+ print(f"[DEBUG] Processor encoding done, elapsed: {time.time() - processor_start:.2f}s")
207
+
208
+ # Ensure inputs on GPU
209
  to_device_start = time.time()
210
  inputs = inputs.to('cuda' if torch.cuda.is_available() else 'cpu')
211
+ print(f"[DEBUG] Inputs moved to device, elapsed: {time.time() - to_device_start:.2f}s")
212
+ print(f"[DEBUG] Input preparation done, total elapsed: {time.time() - start_time:.2f}s")
213
  print(f"[DEBUG] Input IDs shape: {inputs.input_ids.shape}")
214
  print(f"[DEBUG] Input device: {inputs.input_ids.device}")
215
  print(f"[DEBUG] Input sequence length: {inputs.input_ids.shape[1]}")
216
 
217
+ # Generation
218
  gen_start = time.time()
219
+ print(f"[DEBUG] ========== Starting token generation ==========")
220
+
221
+ # Optimized max_new_tokens for OCR tasks
222
+ max_new_tokens = 2048
 
223
  print(f"[DEBUG] max_new_tokens: {max_new_tokens}")
224
+
225
+ # Progress callback
226
  token_count = [0]
227
  last_time = [gen_start]
228
 
 
232
  if token_count[0] % 10 == 0 or (current_time - last_time[0]) > 2.0:
233
  elapsed = current_time - gen_start
234
  tokens_per_sec = token_count[0] / elapsed if elapsed > 0 else 0
235
+ print(f"[DEBUG] Generated {token_count[0]} tokens, speed: {tokens_per_sec:.2f} tokens/s, elapsed: {elapsed:.2f}s")
236
  last_time[0] = current_time
237
  return False
238
 
239
  with torch.no_grad():
240
+ print(f"[DEBUG] Entered torch.no_grad() context, elapsed: {time.time() - start_time:.2f}s")
241
+
242
+ # Test forward pass
243
+ print(f"[DEBUG] Testing forward pass...")
244
  forward_test_start = time.time()
245
  try:
246
  with torch.cuda.amp.autocast(dtype=torch.bfloat16):
247
  test_outputs = model(**inputs, use_cache=False)
248
+ print(f"[DEBUG] Forward pass test successful, elapsed: {time.time() - forward_test_start:.2f}s")
249
  except Exception as e:
250
+ print(f"[WARNING] Forward pass test failed: {e}")
251
+
252
+ print(f"[DEBUG] Starting model.generate()... (elapsed: {time.time() - start_time:.2f}s)")
253
  generate_call_start = time.time()
254
 
255
  try:
256
+ # Deterministic generation
257
  generated_ids = model.generate(
258
  **inputs,
259
  max_new_tokens=max_new_tokens,
260
  do_sample=False,
261
  temperature=0
262
  )
263
+ print(f"[DEBUG] model.generate() returned, elapsed: {time.time() - generate_call_start:.2f}s")
264
  except Exception as e:
265
+ print(f"[ERROR] Generation failed: {e}")
266
  import traceback
267
  traceback.print_exc()
268
  raise
269
 
270
+ print(f"[DEBUG] Exited torch.no_grad() context")
271
+
272
  gen_time = time.time() - gen_start
273
+ print(f"[DEBUG] ========== Generation complete ==========")
274
+ print(f"[DEBUG] Generation time: {gen_time:.2f}s")
275
  print(f"[DEBUG] Output shape: {generated_ids.shape}")
276
+
277
+ # Decode output
278
  if "input_ids" in inputs:
279
  input_ids = inputs.input_ids
280
  else:
 
285
  ]
286
 
287
  actual_tokens = len(generated_ids_trimmed[0])
288
+ print(f"[DEBUG] Actual tokens generated: {actual_tokens}")
289
+ print(f"[DEBUG] Time per token: {gen_time/actual_tokens if actual_tokens > 0 else 0:.3f}s")
290
 
291
  output_texts = processor.batch_decode(
292
  generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
 
294
 
295
 
296
  total_time = time.time() - start_time
297
+ print(f"[DEBUG] ========== All done ==========")
298
+ print(f"[DEBUG] Total time: {total_time:.2f}s")
299
+ print(f"[DEBUG] Output length: {len(output_texts[0])} chars")
300
+ print(f"[DEBUG] Output preview: {output_texts[0][:100]}...")
301
  output_texts[0] = clean_repeated_substrings(output_texts[0])
302
  return output_texts
303
 
 
319
  content = []
320
  for q, a in history_cp:
321
  if isinstance(q, (tuple, list)):
322
+ # Check if URL or local path
323
  img_path = q[0]
324
  if img_path.startswith(('http://', 'https://')):
325
  content.append({'type': 'image', 'image': img_path})
 
332
  content = []
333
  messages.pop()
334
 
335
+ # Call model to get response
336
  response_list = call_local_model(messages)
337
  response = response_list[0] if response_list else ""
338
 
 
360
  _chatbot[-1] = (_chatbot[-1][0], None)
361
  else:
362
  _chatbot.append((chatbot_item[0], None))
363
+ # Use outer predict function
364
  _chatbot_gen = predict(_chatbot, task_history)
365
  for _chatbot in _chatbot_gen:
366
  yield _chatbot
 
386
  return history, task_history
387
 
388
  def download_url_image(url):
389
+ """Download URL image to local temp file"""
390
  try:
391
+ # Use URL hash as filename to avoid duplicate downloads
392
  url_hash = hashlib.md5(url.encode()).hexdigest()
393
  temp_dir = tempfile.gettempdir()
394
  temp_path = os.path.join(temp_dir, f"hyocr_demo_{url_hash}.png")
395
+
396
+ # Return cached file if exists
397
  if os.path.exists(temp_path):
398
  return temp_path
399
+
400
+ # Download image
401
  response = requests.get(url, timeout=10)
402
  response.raise_for_status()
403
  with open(temp_path, 'wb') as f:
404
  f.write(response.content)
405
  return temp_path
406
  except Exception as e:
407
+ print(f"Failed to download image: {url}, error: {e}")
408
+ return url # Return original URL on failure
409
 
410
  def reset_user_input():
411
  return gr.update(value='')
 
416
  _gc()
417
  return []
418
 
419
+ # Example image paths - local files
420
  EXAMPLE_IMAGES = {
421
+ "spotting": "examples/spotting.jpg",
422
+ "parsing": "examples/parsing.jpg",
423
+ "ie": "examples/ie.jpg",
424
+ "vqa": "examples/vqa.jpg",
425
+ "translation": "examples/translation.jpg"
 
 
 
 
 
426
  }
427
 
428
+ with gr.Blocks() as demo:
429
+ # Header
430
+ gr.Markdown("# HunyuanOCR\n*Powered by Tencent Hunyuan Team*")
431
 
432
+ with gr.Column():
433
+ # Chat area
434
  chatbot = gr.Chatbot(
435
+ label='Chat',
436
  height=600,
437
  bubble_full_width=False,
438
  layout="bubble",
439
  show_copy_button=True,
 
 
440
  )
441
+
442
+ # Input panel
443
+ with gr.Group():
444
  query = gr.Textbox(
445
+ lines=2,
446
+ label='Enter your question',
447
+ placeholder='Upload an image first, then enter your question. Example: Detect and recognize text in this image.',
 
448
  show_label=False
449
  )
450
+
451
  with gr.Row():
452
+ addfile_btn = gr.UploadButton('Upload Image', file_types=['image'])
453
+ submit_btn = gr.Button('Send', variant="primary", scale=3)
454
+ regen_btn = gr.Button('Regenerate')
455
+ empty_bin = gr.Button('Clear')
 
 
 
456
 
457
+ # Examples section
458
+ gr.Markdown("### Quick Examples - Click to load")
459
+
460
  with gr.Row():
461
+ example_1_btn = gr.Button("Text Detection")
462
+ example_2_btn = gr.Button("Document Parsing")
463
+ example_3_btn = gr.Button("Info Extraction")
464
+ example_4_btn = gr.Button("Visual Q&A")
465
+ example_5_btn = gr.Button("Translation")
466
 
467
  task_history = gr.State([])
468
 
469
 
470
+ # Example 1: Text Detection
471
  def load_example_1(history, task_hist):
472
+ prompt = "Detect and recognize all text in this image. Output the text with bounding box coordinates."
473
+ image_path = EXAMPLE_IMAGES["spotting"]
474
+ history = [((image_path,), None)]
475
+ task_hist = [((image_path,), None)]
 
 
 
 
 
476
  return history, task_hist, prompt
477
+
478
+ # Example 2: Document Parsing
 
 
479
  def load_example_2(history, task_hist):
480
+ prompt = "Extract all text from this document in markdown format. Use HTML for tables and LaTeX for equations. Parse in reading order."
481
+ image_path = EXAMPLE_IMAGES["parsing"]
482
+ history = [((image_path,), None)]
483
+ task_hist = [((image_path,), None)]
 
 
 
 
 
484
  return history, task_hist, prompt
485
+
486
+ # Example 3: Information Extraction
 
 
487
  def load_example_3(history, task_hist):
488
+ prompt = "Extract the following fields from this receipt and return as JSON: ['total', 'subtotal', 'tax', 'date', 'items']"
489
+ image_path = EXAMPLE_IMAGES["ie"]
490
+ history = [((image_path,), None)]
491
+ task_hist = [((image_path,), None)]
 
 
 
 
 
492
  return history, task_hist, prompt
493
+
494
+ # Example 4: Visual Q&A
495
  def load_example_4(history, task_hist):
496
+ prompt = "Look at this chart and answer: Which quarter had the highest revenue? What was the Sales value in Q4?"
497
+ image_path = EXAMPLE_IMAGES["vqa"]
498
+ history = [((image_path,), None)]
499
+ task_hist = [((image_path,), None)]
 
 
 
 
 
500
  return history, task_hist, prompt
501
+
502
+ # Example 5: Translation
503
  def load_example_5(history, task_hist):
504
+ prompt = "Translate all text in this image to English."
505
+ image_path = EXAMPLE_IMAGES["translation"]
506
+ history = [((image_path,), None)]
507
+ task_hist = [((image_path,), None)]
 
 
 
 
 
508
  return history, task_hist, prompt
509
 
510
+ # Bind events
511
  example_1_btn.click(load_example_1, [chatbot, task_history], [chatbot, task_history, query])
512
  example_2_btn.click(load_example_2, [chatbot, task_history], [chatbot, task_history, query])
513
  example_3_btn.click(load_example_3, [chatbot, task_history], [chatbot, task_history, query])
 
521
  regen_btn.click(regenerate, [chatbot, task_history], [chatbot], show_progress=True)
522
  addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn], [chatbot, task_history], show_progress=True)
523
 
524
+ # Feature descriptions
525
  with gr.Row():
526
  with gr.Column(scale=1):
527
+ gr.Markdown("""
528
+ ### Core Features
529
+ - **Text Detection & Recognition** - Multi-scene text detection and recognition
530
+ - **Document Parsing** - Automatic document structure recognition
531
+ - **Information Extraction** - Extract structured data from receipts and forms
532
+ - **Visual Q&A** - Text-centric open-ended question answering
533
+ - **Translation** - Translate text in images across 14+ languages
 
 
 
 
534
  """)
535
+
536
  with gr.Column(scale=1):
537
+ gr.Markdown("""
538
+ ### Usage Tips
539
+ - **Inference** - For production, use VLLM for better performance
540
+ - **Image Quality** - Ensure images are clear, well-lit, and not heavily skewed
541
+ - **File Size** - Recommended max 10MB per image, JPG/PNG format
542
+ - **Use Cases** - OCR, document digitization, receipt recognition, translation
 
 
 
 
 
543
  """)
544
+
545
+ # Footer
546
+ gr.Markdown("---\n*2025 Tencent Hunyuan Team. For research and educational use.*")
 
 
 
 
 
547
 
548
  demo.queue().launch(
549
  share=args.share,
examples/ie.jpg ADDED
examples/parsing.jpg ADDED
examples/spotting.jpg ADDED
examples/translation.jpg ADDED
examples/vqa.jpg ADDED