import os
import torch
import numpy as np
from PIL import Image
import spaces
from transformers import AutoProcessor
from qwen_vl_utils import process_vision_info  # make sure this module is available in your environment
from transformers import HunYuanVLForConditionalGeneration
import gradio as gr
from argparse import ArgumentParser
import copy
import requests
from io import BytesIO
import tempfile
import hashlib
import gc
def _get_args():
parser = ArgumentParser()
parser.add_argument('-c',
'--checkpoint-path',
type=str,
default='tencent/HunyuanOCR',
help='Checkpoint name or path, default to %(default)r')
parser.add_argument('--cpu-only', action='store_true', help='Run demo with CPU only')
parser.add_argument('--flash-attn2',
action='store_true',
default=False,
help='Enable flash_attention_2 when loading the model.')
parser.add_argument('--share',
action='store_true',
default=False,
help='Create a publicly shareable link for the interface.')
parser.add_argument('--inbrowser',
action='store_true',
default=False,
help='Automatically launch the interface in a new tab on the default browser.')
args = parser.parse_args()
return args
def _load_model_processor(args):
    # ZeroGPU environment: the model is loaded on CPU in eager mode.
    # It is moved to the GPU automatically inside the @spaces.GPU decorator.
    print("[INFO] Loading model (eager mode for the ZeroGPU environment)")
model = HunYuanVLForConditionalGeneration.from_pretrained(
args.checkpoint_path,
        attn_implementation="eager",  # ZeroGPU requires eager because the model starts on CPU
torch_dtype=torch.bfloat16,
        device_map="auto",  # back to auto so ZeroGPU manages device placement
token=os.environ.get('HF_TOKEN')
)
processor = AutoProcessor.from_pretrained(args.checkpoint_path, use_fast=False, trust_remote_code=True)
print(f"[INFO] 模型加载完成")
return model, processor
def _parse_text(text):
    """Strip the <trans> wrapper tags from model output."""
    text = text.replace("<trans>", "").replace("</trans>", "")
    return text
def _remove_image_special(text):
    """Placeholder for stripping image special tokens; currently a no-op."""
    return text
def _gc():
"""垃圾回收"""
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
def _launch_demo(args, model, processor):
    # Key fix: access model and processor through the closure instead of passing them as arguments
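    # @spaces.GPU allocates a ZeroGPU device for each call, for at most `duration` seconds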
@spaces.GPU(duration=60)
def call_local_model(messages):
import time
start_time = time.time()
print(f"[DEBUG] ========== 开始推理 ==========")
# 关键:检查并确保模型在 GPU 上
model_device = next(model.parameters()).device
print(f"[DEBUG] Model device: {model_device}")
if str(model_device) == 'cpu':
print(f"[ERROR] 模型在 CPU 上!尝试移动到 GPU...")
model.cuda()
print(f"[DEBUG] Model device after cuda(): {next(model.parameters()).device}")
messages = [messages]
        # Build the model inputs with the processor
texts = [
processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
for msg in messages
]
print(f"[DEBUG] 模板构建完成,耗时: {time.time() - start_time:.2f}s")
image_inputs, video_inputs = process_vision_info(messages)
print(f"[DEBUG] 图像处理完成,耗时: {time.time() - start_time:.2f}s")
inputs = processor(
text=texts,
images=image_inputs,
videos=video_inputs,
padding=True,
return_tensors="pt",
)
        # Make sure the inputs are on the same device as the model
inputs = inputs.to('cuda' if torch.cuda.is_available() else 'cpu')
print(f"[DEBUG] 输入准备完成,耗时: {time.time() - start_time:.2f}s")
print(f"[DEBUG] Input IDs shape: {inputs.input_ids.shape}")
print(f"[DEBUG] Input device: {inputs.input_ids.device}")
        # Generate
gen_start = time.time()
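        # Greedy decoding (do_sample=False) with a mild repetition penalty keeps OCR output deterministic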
with torch.no_grad():
generated_ids = model.generate(
**inputs,
max_new_tokens=256,
repetition_penalty=1.03,
do_sample=False,
eos_token_id=processor.tokenizer.eos_token_id,
pad_token_id=processor.tokenizer.pad_token_id,
)
gen_time = time.time() - gen_start
print(f"[DEBUG] ========== 生成完成 ==========")
print(f"[DEBUG] 生成耗时: {gen_time:.2f}s")
print(f"[DEBUG] Output shape: {generated_ids.shape}")
        # Decode the output
if "input_ids" in inputs:
input_ids = inputs.input_ids
else:
input_ids = inputs.inputs
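        # Trim the prompt tokens so only the newly generated tokens get decoded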
generated_ids_trimmed = [
out_ids[len(in_ids):] for in_ids, out_ids in zip(input_ids, generated_ids)
]
actual_tokens = len(generated_ids_trimmed[0])
print(f"[DEBUG] 实际生成 token 数: {actual_tokens}")
print(f"[DEBUG] 每 token 耗时: {gen_time/actual_tokens if actual_tokens > 0 else 0:.3f}s")
output_texts = processor.batch_decode(
generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
total_time = time.time() - start_time
print(f"[DEBUG] ========== 全部完成 ==========")
print(f"[DEBUG] 总耗时: {total_time:.2f}s")
print(f"[DEBUG] 输出长度: {len(output_texts[0])} 字符")
print(f"[DEBUG] 输出预览: {output_texts[0][:100]}...")
return output_texts
def create_predict_fn():
def predict(_chatbot, task_history):
nonlocal model, processor
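            # _chatbot holds what the UI displays; task_history keeps the raw (query, answer) pairs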
chat_query = _chatbot[-1][0]
query = task_history[-1][0]
if len(chat_query) == 0:
_chatbot.pop()
task_history.pop()
return _chatbot
print('User: ', query)
history_cp = copy.deepcopy(task_history)
full_response = ''
messages = []
content = []
for q, a in history_cp:
if isinstance(q, (tuple, list)):
                    # Distinguish URLs from local paths
img_path = q[0]
if img_path.startswith(('http://', 'https://')):
content.append({'type': 'image', 'image': img_path})
else:
content.append({'type': 'image', 'image': f'{os.path.abspath(img_path)}'})
else:
content.append({'type': 'text', 'text': q})
messages.append({'role': 'user', 'content': content})
messages.append({'role': 'assistant', 'content': [{'type': 'text', 'text': a}]})
content = []
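            # Drop the trailing assistant turn: its answer is still None for the current query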
messages.pop()
            # Call the model for a response (model and processor are no longer passed in)
response_list = call_local_model(messages)
response = response_list[0] if response_list else ""
_chatbot[-1] = (_parse_text(chat_query), _remove_image_special(_parse_text(response)))
full_response = _parse_text(response)
task_history[-1] = (query, full_response)
print('HunyuanOCR: ' + _parse_text(full_response))
yield _chatbot
return predict
def create_regenerate_fn():
def regenerate(_chatbot, task_history):
nonlocal model, processor
if not task_history:
return _chatbot
item = task_history[-1]
if item[1] is None:
return _chatbot
task_history[-1] = (item[0], None)
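            # Pop the last chatbot entry and reopen it with an empty answer before re-running predict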
chatbot_item = _chatbot.pop(-1)
if chatbot_item[0] is None:
_chatbot[-1] = (_chatbot[-1][0], None)
else:
_chatbot.append((chatbot_item[0], None))
            # Reuse the outer predict function
_chatbot_gen = predict(_chatbot, task_history)
for _chatbot in _chatbot_gen:
yield _chatbot
return regenerate
predict = create_predict_fn()
regenerate = create_regenerate_fn()
def add_text(history, task_history, text):
task_text = text
history = history if history is not None else []
task_history = task_history if task_history is not None else []
history = history + [(_parse_text(text), None)]
task_history = task_history + [(task_text, None)]
return history, task_history, ''
def add_file(history, task_history, file):
history = history if history is not None else []
task_history = task_history if task_history is not None else []
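        # gr.Chatbot renders a (filepath,) tuple as an image message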
history = history + [((file.name,), None)]
task_history = task_history + [((file.name,), None)]
return history, task_history
def download_url_image(url):
"""下载 URL 图片到本地临时文件"""
try:
            # Use the URL's hash as the filename to avoid repeated downloads
url_hash = hashlib.md5(url.encode()).hexdigest()
temp_dir = tempfile.gettempdir()
temp_path = os.path.join(temp_dir, f"hyocr_demo_{url_hash}.png")
            # Return immediately if the file already exists
if os.path.exists(temp_path):
return temp_path
            # Download the image
response = requests.get(url, timeout=10)
response.raise_for_status()
with open(temp_path, 'wb') as f:
f.write(response.content)
return temp_path
except Exception as e:
print(f"下载图片失败: {url}, 错误: {e}")
return url # 失败时返回原 URL
def reset_user_input():
return gr.update(value='')
def reset_state(_chatbot, task_history):
task_history.clear()
_chatbot.clear()
_gc()
return []
    # Example image configuration: remote URLs by default, local fallbacks commented below
EXAMPLE_IMAGES = {
"spotting": "https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/23cc43af9376b948f3febaf4ce854a8a.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763523817%3B1794627877&q-key-time=1763523817%3B1794627877&q-header-list=host&q-url-param-list=&q-signature=8ebd6a9d3ed7eba73bb783c337349db9c29972e2", # TODO: 替换为场景文字示例图片路径
"parsing": "https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/c4997ebd1be9f7c3e002fabba8b46cb7.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763523818%3B1794627878&q-key-time=1763523818%3B1794627878&q-header-list=host&q-url-param-list=&q-signature=d2cd12be4c7902821c8c82203e4642624046911a",
"ie": "https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/7c67c0f78e4423d51644a325da1f8e85.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763523818%3B1794627878&q-key-time=1763523818%3B1794627878&q-header-list=host&q-url-param-list=&q-signature=803648f3253706f654faf1423869fd9e00e7056e",
"vqa": "https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/fea0865d1c70c53aaa2ab91cd0e787f5.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763523818%3B1794627878&q-key-time=1763523818%3B1794627878&q-header-list=host&q-url-param-list=&q-signature=a92b94e298a11aea130d730d3b16ee761acc3f4c",
"translation": "https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/d1af99d35e9db9e820ebebb5bc68993a.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763967603%3B1795071663&q-key-time=1763967603%3B1795071663&q-header-list=host&q-url-param-list=&q-signature=a57080c0b3d4c76ea74b88c6291f9004241c9d49",
# "spotting": "examples/spotting.jpg",
# "parsing": "examples/parsing.jpg",
# "ie": "examples/ie.jpg",
# "vqa": "examples/vqa.jpg",
# "translation": "examples/translation.jpg"
}
with gr.Blocks(css="""
body {
background: #f5f7fa;
}
.gradio-container {
max-width: 100% !important;
padding: 0 40px !important;
}
.header-section {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
padding: 30px 0;
margin: -20px -40px 30px -40px;
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
}
.header-content {
max-width: 1600px;
margin: 0 auto;
padding: 0 40px;
display: flex;
align-items: center;
gap: 20px;
}
.header-logo {
height: 60px;
}
.header-text h1 {
color: white;
font-size: 32px;
font-weight: bold;
margin: 0 0 5px 0;
}
.header-text p {
color: rgba(255,255,255,0.9);
margin: 0;
font-size: 14px;
}
.main-container {
max-width: 1800px;
margin: 0 auto;
}
.chatbot {
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08) !important;
border-radius: 12px !important;
border: 1px solid #e5e7eb !important;
background: white !important;
}
.input-panel {
background: white;
padding: 20px;
border-radius: 12px;
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
border: 1px solid #e5e7eb;
}
.input-box textarea {
border: 2px solid #e5e7eb !important;
border-radius: 8px !important;
font-size: 14px !important;
}
.input-box textarea:focus {
border-color: #667eea !important;
}
.btn-primary {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
border: none !important;
color: white !important;
font-weight: 500 !important;
padding: 10px 24px !important;
font-size: 14px !important;
}
.btn-primary:hover {
transform: translateY(-1px) !important;
box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4) !important;
}
.btn-secondary {
background: white !important;
border: 2px solid #667eea !important;
color: #667eea !important;
padding: 8px 20px !important;
font-size: 14px !important;
}
.btn-secondary:hover {
background: #f0f4ff !important;
}
.example-grid {
display: grid;
grid-template-columns: repeat(4, 1fr);
gap: 20px;
margin-top: 30px;
}
.example-card {
background: white;
border-radius: 12px;
overflow: hidden;
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
border: 1px solid #e5e7eb;
transition: all 0.3s ease;
}
.example-card:hover {
transform: translateY(-4px);
box-shadow: 0 8px 20px rgba(102, 126, 234, 0.15);
border-color: #667eea;
}
.example-image-wrapper {
width: 100%;
height: 180px;
overflow: hidden;
background: #f5f7fa;
}
.example-image-wrapper img {
width: 100%;
height: 100%;
object-fit: cover;
}
.example-btn {
width: 100% !important;
white-space: pre-wrap !important;
text-align: left !important;
padding: 16px !important;
background: white !important;
border: none !important;
border-top: 1px solid #e5e7eb !important;
color: #1f2937 !important;
font-size: 14px !important;
line-height: 1.6 !important;
transition: all 0.3s ease !important;
font-weight: 500 !important;
}
.example-btn:hover {
background: #f9fafb !important;
color: #667eea !important;
}
.feature-section {
background: white;
padding: 24px;
border-radius: 12px;
margin-top: 30px;
box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
border: 1px solid #e5e7eb;
}
.section-title {
font-size: 18px;
font-weight: 600;
color: #1f2937;
margin-bottom: 20px;
padding-bottom: 12px;
border-bottom: 2px solid #e5e7eb;
}
""") as demo:
        # Top header bar
gr.HTML("""
<div class="header-section">
<div class="header-content">
<img src="https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/6ef6928b21b323b2b00115f86a779d8f.png?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763450355%3B1794554415&q-key-time=1763450355%3B1794554415&q-header-list=host&q-url-param-list=&q-signature=41328696dc34571324aa18c791c1196192e729c6" class="header-logo"/>
<div class="header-text">
<h1>HunyuanOCR</h1>
<p>Powered by Tencent Hunyuan Team</p>
</div>
</div>
</div>
""")
with gr.Column(elem_classes=["main-container"]):
            # Chat area (full width)
chatbot = gr.Chatbot(
label='💬 对话窗口',
height=600,
bubble_full_width=False,
layout="bubble",
show_copy_button=True,
avatar_images=(None, "https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/6ef6928b21b323b2b00115f86a779d8f.png?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763450355%3B1794554415&q-key-time=1763450355%3B1794554415&q-header-list=host&q-url-param-list=&q-signature=41328696dc34571324aa18c791c1196192e729c6"),
elem_classes=["chatbot"]
)
            # Input control panel (full width)
with gr.Group(elem_classes=["input-panel"]):
query = gr.Textbox(
lines=2,
label='💭 输入您的问题',
placeholder='请先上传图片,然后输入问题。例如:检测并识别图片中的文字,将文本坐标格式化输出。',
elem_classes=["input-box"],
show_label=False
)
with gr.Row():
addfile_btn = gr.UploadButton('📁 上传图片', file_types=['image'], elem_classes=["btn-secondary"])
submit_btn = gr.Button('🚀 发送消息', variant="primary", elem_classes=["btn-primary"], scale=3)
regen_btn = gr.Button('🔄 重新生成', elem_classes=["btn-secondary"])
empty_bin = gr.Button('🗑️ 清空对话', elem_classes=["btn-secondary"])
            # Examples area (5-column grid)
gr.HTML('<div class="section-title">📚 快速体验示例 - 点击下方卡片快速加载</div>')
with gr.Row():
                # Example 1: spotting
with gr.Column(scale=1):
with gr.Group(elem_classes=["example-card"]):
gr.HTML("""
<div class="example-image-wrapper">
<img src="https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/23cc43af9376b948f3febaf4ce854a8a.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763523817%3B1794627877&q-key-time=1763523817%3B1794627877&q-header-list=host&q-url-param-list=&q-signature=8ebd6a9d3ed7eba73bb783c337349db9c29972e2" alt="文字检测识别"/>
</div>
""")
example_1_btn = gr.Button("🔍 文字检测和识别", elem_classes=["example-btn"])
                # Example 2: parsing
with gr.Column(scale=1):
with gr.Group(elem_classes=["example-card"]):
gr.HTML("""
<div class="example-image-wrapper">
<img src="https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/c4997ebd1be9f7c3e002fabba8b46cb7.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763523818%3B1794627878&q-key-time=1763523818%3B1794627878&q-header-list=host&q-url-param-list=&q-signature=d2cd12be4c7902821c8c82203e4642624046911a" alt="文档解析"/>
</div>
""")
example_2_btn = gr.Button("📋 文档解析", elem_classes=["example-btn"])
                # Example 3: information extraction (ie)
with gr.Column(scale=1):
with gr.Group(elem_classes=["example-card"]):
gr.HTML("""
<div class="example-image-wrapper">
<img src="https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/7c67c0f78e4423d51644a325da1f8e85.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763523818%3B1794627878&q-key-time=1763523818%3B1794627878&q-header-list=host&q-url-param-list=&q-signature=803648f3253706f654faf1423869fd9e00e7056e" alt="信息抽取"/>
</div>
""")
example_3_btn = gr.Button("🎯 信息抽取", elem_classes=["example-btn"])
                # Example 4: VQA
with gr.Column(scale=1):
with gr.Group(elem_classes=["example-card"]):
gr.HTML("""
<div class="example-image-wrapper">
<img src="https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/fea0865d1c70c53aaa2ab91cd0e787f5.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763523818%3B1794627878&q-key-time=1763523818%3B1794627878&q-header-list=host&q-url-param-list=&q-signature=a92b94e298a11aea130d730d3b16ee761acc3f4c" alt="视觉问答"/>
</div>
""")
example_4_btn = gr.Button("💬 视觉问答", elem_classes=["example-btn"])
                # Example 5: translation
with gr.Column(scale=1):
with gr.Group(elem_classes=["example-card"]):
gr.HTML("""
<div class="example-image-wrapper">
<img src="https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/d1af99d35e9db9e820ebebb5bc68993a.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763967603%3B1795071663&q-key-time=1763967603%3B1795071663&q-header-list=host&q-url-param-list=&q-signature=a57080c0b3d4c76ea74b88c6291f9004241c9d49" alt="图片翻译"/>
</div>
""")
example_5_btn = gr.Button("🌐 图片翻译", elem_classes=["example-btn"])
task_history = gr.State([])
        # Example 1: text detection and recognition (spotting)
def load_example_1(history, task_hist):
prompt = "检测并识别图片中的文字,将文本坐标格式化输出。"
image_url = EXAMPLE_IMAGES["spotting"]
            # Download the URL image locally
image_path = download_url_image(image_url)
            # Clear the chat history
history = []
task_hist = []
history = history + [((image_path,), None)]
task_hist = task_hist + [((image_path,), None)]
return history, task_hist, prompt
        # Example 2: document parsing
def load_example_2(history, task_hist):
prompt = "提取文档图片中正文的所有信息用markdown 格式表示,其中页眉、页脚部分忽略,表格用html 格式表达,文档中公式用latex 格式表示,按照阅读顺序组织进行解析。"
image_url = EXAMPLE_IMAGES["parsing"]
            # Download the URL image locally
image_path = download_url_image(image_url)
            # Clear the chat history
history = []
task_hist = []
history = history + [((image_path,), None)]
task_hist = task_hist + [((image_path,), None)]
return history, task_hist, prompt
        # Example 3: information extraction
def load_example_3(history, task_hist):
prompt = "提取图片中的:['单价', '上车时间','发票号码', '省前缀', '总金额', '发票代码', '下车时间', '里程数'] 的字段内容,并且按照JSON格式返回。"
image_url = EXAMPLE_IMAGES["ie"]
            # Download the URL image locally
image_path = download_url_image(image_url)
            # Clear the chat history
history = []
task_hist = []
history = history + [((image_path,), None)]
task_hist = task_hist + [((image_path,), None)]
return history, task_hist, prompt
        # Example 4: visual question answering
def load_example_4(history, task_hist):
prompt = "What is the highest life expectancy at birth of male?"
image_url = EXAMPLE_IMAGES["vqa"]
            # Download the URL image locally
image_path = download_url_image(image_url)
            # Clear the chat history
history = []
task_hist = []
history = history + [((image_path,), None)]
task_hist = task_hist + [((image_path,), None)]
return history, task_hist, prompt
        # Example 5: image translation
def load_example_5(history, task_hist):
prompt = "将图中文字翻译为中文。"
image_url = EXAMPLE_IMAGES["translation"]
            # Download the URL image locally
image_path = download_url_image(image_url)
            # Clear the chat history
history = []
task_hist = []
history = history + [((image_path,), None)]
task_hist = task_hist + [((image_path,), None)]
return history, task_hist, prompt
        # Wire up events
example_1_btn.click(load_example_1, [chatbot, task_history], [chatbot, task_history, query])
example_2_btn.click(load_example_2, [chatbot, task_history], [chatbot, task_history, query])
example_3_btn.click(load_example_3, [chatbot, task_history], [chatbot, task_history, query])
example_4_btn.click(load_example_4, [chatbot, task_history], [chatbot, task_history, query])
example_5_btn.click(load_example_5, [chatbot, task_history], [chatbot, task_history, query])
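        # Sending first appends the user turn via add_text, then streams the model response from predict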
submit_btn.click(add_text, [chatbot, task_history, query],
[chatbot, task_history]).then(predict, [chatbot, task_history], [chatbot], show_progress=True)
submit_btn.click(reset_user_input, [], [query])
empty_bin.click(reset_state, [chatbot, task_history], [chatbot], show_progress=True)
regen_btn.click(regenerate, [chatbot, task_history], [chatbot], show_progress=True)
addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn], [chatbot, task_history], show_progress=True)
        # Feature description area
with gr.Row():
with gr.Column(scale=1):
gr.HTML("""
<div class="feature-section">
<div class="section-title">✨ 核心功能</div>
<ul style="line-height: 2; color: #4b5563; font-size: 14px; margin: 0; padding-left: 20px;">
<li><strong>🎯 高精度文字检测识别</strong> - 支持多场景文字检测与识别</li>
<li><strong>📐 智能文档解析</strong> - 自动识别文档结构,支持多粒度文档解析</li>
<li><strong>📋 信息提取</strong> - 支持30+高频卡证票据识别和结构化输出</li>
<li><strong>✏️ 视觉问答</strong> - 支持以文本为中心的开放式问答</li>
<li><strong>🌍 跨语言翻译</strong> - 支持中英互译及14+语种译为中英文</li>
</ul>
</div>
""")
with gr.Column(scale=1):
gr.HTML("""
<div class="feature-section">
<div class="section-title">💡 使用建议</div>
<ul style="line-height: 2; color: #4b5563; font-size: 14px; margin: 0; padding-left: 20px;">
<li><strong>图片质量</strong> - 确保图片清晰,光线充足,分辨率适中</li>
<li><strong>拍摄角度</strong> - 避免严重倾斜、遮挡或反光,正面拍摄效果最佳</li>
<li><strong>文件大小</strong> - 建议单张图片不超过 10MB,支持 JPG/PNG 格式</li>
<li><strong>使用场景</strong> - 适用于文字检测识别、文档数字化、票据识别、信息提取、文字图片翻译等</li>
<li><strong>合规使用</strong> - 仅供学习研究,请遵守法律法规,尊重隐私权</li>
</ul>
</div>
""")
        # Footer copyright
gr.HTML("""
<div style="text-align: center; color: #9ca3af; font-size: 13px; margin-top: 40px; padding: 20px; border-top: 1px solid #e5e7eb;">
<p style="margin: 0;">© 2025 Tencent Hunyuan Team. All rights reserved.</p>
<p style="margin: 5px 0 0 0;">本系统基于 HunyuanOCR 构建 | 仅供学习研究使用</p>
</div>
""")
    demo.queue().launch(
        share=args.share,
        inbrowser=args.inbrowser,
    )
def main():
args = _get_args()
model, processor = _load_model_processor(args)
_launch_demo(args, model, processor)
if __name__ == '__main__':
main()