aleclyu committed on
Commit
5d5f953
·
1 Parent(s): 586b09a

init commit

Browse files
Files changed (2) hide show
  1. app.py +652 -0
  2. requirements.txt +12 -0
app.py ADDED
@@ -0,0 +1,652 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import numpy as np
4
+ from PIL import Image
5
+ from transformers import AutoProcessor
6
+ from qwen_vl_utils import process_vision_info # 请确保该模块在你的环境可用
7
+ from transformers import HunYuanVLForConditionalGeneration
8
+ import gradio as gr
9
+ from argparse import ArgumentParser
10
+ import copy
11
+ import requests
12
+ from io import BytesIO
13
+ import tempfile
14
+ import hashlib
15
+ import gc
16
+
17
+
18
+
19
+
20
+ def _get_args():
21
+ parser = ArgumentParser()
22
+
23
+ parser.add_argument('-c',
24
+ '--checkpoint-path',
25
+ type=str,
26
+ default='tencent/HunyuanOCR',
27
+ help='Checkpoint name or path, default to %(default)r')
28
+ parser.add_argument('--cpu-only', action='store_true', help='Run demo with CPU only')
29
+
30
+ parser.add_argument('--flash-attn2',
31
+ action='store_true',
32
+ default=False,
33
+ help='Enable flash_attention_2 when loading the model.')
34
+ parser.add_argument('--share',
35
+ action='store_true',
36
+ default=False,
37
+ help='Create a publicly shareable link for the interface.')
38
+ parser.add_argument('--inbrowser',
39
+ action='store_true',
40
+ default=False,
41
+ help='Automatically launch the interface in a new tab on the default browser.')
42
+ # parser.add_argument('--server-port', type=int, default=8080, help='Demo server port.')
43
+ # parser.add_argument('--server-name', type=str, default='29.210.129.176', help='Demo server name.')
44
+
45
+ args = parser.parse_args()
46
+ return args
47
+
48
+
49
def _load_model_processor(args):
    """Load the HunyuanOCR model and its processor from ``args.checkpoint_path``.

    The ``--cpu-only`` and ``--flash-attn2`` command-line switches are honoured
    here; without them the previous defaults (``device_map="auto"`` and eager
    attention) are kept.

    Args:
        args: parsed command-line namespace. ``checkpoint_path`` is required;
            ``cpu_only`` and ``flash_attn2`` are optional booleans.

    Returns:
        A ``(model, processor)`` tuple.
    """
    # getattr keeps callers that build the namespace by hand (without the
    # optional switches) working.
    device_map = 'cpu' if getattr(args, 'cpu_only', False) else 'auto'
    attn_implementation = (
        'flash_attention_2' if getattr(args, 'flash_attn2', False) else 'eager'
    )

    model = HunYuanVLForConditionalGeneration.from_pretrained(
        args.checkpoint_path,
        attn_implementation=attn_implementation,
        torch_dtype=torch.bfloat16,
        device_map=device_map,
        token=os.environ.get('HF_TOKEN'),
    )
    processor = AutoProcessor.from_pretrained(
        args.checkpoint_path, use_fast=False, trust_remote_code=True
    )
    return model, processor
59
+
60
+
61
+ def _parse_text(text):
62
+ """解析文本,处理特殊格式"""
63
+ # if text is None:
64
+ # return text
65
+ text = text.replace("<trans>", "").replace("</trans>", "")
66
+ return text
67
+
68
+
69
+ def _remove_image_special(text):
70
+ """移除图像特殊标记"""
71
+ # if text is None:
72
+ # return text
73
+ # # 移除可能的图像特殊标记
74
+ # import re
75
+ # text = re.sub(r'<image>|</image>|<img>|</img>', '', text)
76
+ # return text
77
+ return text
78
+
79
+
80
+ def _gc():
81
+ """垃圾回收"""
82
+ gc.collect()
83
+ if torch.cuda.is_available():
84
+ torch.cuda.empty_cache()
85
+
86
+
87
+ def _launch_demo(args, model, processor):
88
def call_local_model(model, processor, messages):
    """Run one non-streaming generation pass for a single conversation.

    Args:
        model: object exposing ``device`` and ``generate``.
        processor: object exposing ``apply_chat_template``, ``batch_decode``
            and a ``__call__`` that builds the model inputs.
        messages: one conversation, as a list of role/content dicts.

    Returns:
        The list returned by ``processor.batch_decode`` for the newly
        generated tokens (one entry, since the batch holds one conversation).
    """
    print(messages)
    # The processor and the vision helper both work on batches.
    batch = [messages]

    prompts = []
    for conversation in batch:
        prompts.append(
            processor.apply_chat_template(
                conversation, tokenize=False, add_generation_prompt=True
            )
        )

    image_inputs, video_inputs = process_vision_info(batch)
    model_inputs = processor(
        text=prompts,
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    # Greedy decoding with a mild repetition penalty; no gradients needed.
    with torch.no_grad():
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=1024 * 8,
            repetition_penalty=1.03,
            do_sample=False,
        )

    # Fall back to the `inputs` attribute when no `input_ids` key is present.
    prompt_ids = (
        model_inputs.input_ids if "input_ids" in model_inputs else model_inputs.inputs
    )

    # Each output row starts with its prompt; keep only the tokens after it.
    completions = []
    for prompt_row, output_row in zip(prompt_ids, generated_ids):
        completions.append(output_row[len(prompt_row):])

    return processor.batch_decode(
        completions, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )
140
+
141
+
142
def create_predict_fn():
    """Build the Gradio callback that answers the latest user turn.

    Returns:
        ``predict(_chatbot, task_history)``, a generator function yielding the
        updated chatbot history.
    """

    def predict(_chatbot, task_history):
        """Generate the assistant reply for the last entry of the history.

        ``model`` and ``processor`` are only read here, so they are resolved
        from the enclosing scope without a ``nonlocal`` declaration.

        Args:
            _chatbot: displayed history, a list of ``(user, assistant)`` pairs.
            task_history: raw history with the same layout; image turns carry
                a tuple/list whose first element is a path or URL.

        Yields:
            The updated ``_chatbot`` list, exactly once.
        """
        chat_query = _chatbot[-1][0]
        query = task_history[-1][0]
        if len(chat_query) == 0:
            # Drop the empty turn. This is a generator, so the result has to
            # be yielded: a `return value` here would be discarded and the UI
            # would never receive the trimmed history.
            _chatbot.pop()
            task_history.pop()
            yield _chatbot
            return

        print('User: ', query)
        history_cp = copy.deepcopy(task_history)
        messages = []
        content = []
        for q, a in history_cp:
            if isinstance(q, (tuple, list)):
                # Image turn: URLs are passed through, local paths are made
                # absolute. Images accumulate until the next text turn.
                img_path = q[0]
                if img_path.startswith(('http://', 'https://')):
                    content.append({'type': 'image', 'image': img_path})
                else:
                    content.append({'type': 'image', 'image': os.path.abspath(img_path)})
            else:
                content.append({'type': 'text', 'text': q})
                messages.append({'role': 'user', 'content': content})
                messages.append({'role': 'assistant', 'content': [{'type': 'text', 'text': a}]})
                content = []
        # The last assistant entry is the still-empty reply being generated.
        messages.pop()

        response_list = call_local_model(model, processor, messages)
        response = response_list[0] if response_list else ""
        full_response = _parse_text(response)

        _chatbot[-1] = (_parse_text(chat_query), _remove_image_special(full_response))
        task_history[-1] = (query, full_response)
        print('HunyuanOCR: ' + _parse_text(full_response))
        yield _chatbot

    return predict
184
+
185
def create_regenerate_fn():
    """Build the Gradio callback that regenerates the last assistant reply.

    Returns:
        ``regenerate(_chatbot, task_history)``, a generator function yielding
        the updated chatbot history.
    """

    def regenerate(_chatbot, task_history):
        """Clear the last answer and run ``predict`` again for that turn.

        Args:
            _chatbot: displayed history, a list of ``(user, assistant)`` pairs.
            task_history: raw history with the same layout.

        Yields:
            The chatbot history: unchanged when there is nothing to
            regenerate, otherwise whatever ``predict`` yields.
        """
        # This is a generator, so the "nothing to do" exits must yield the
        # history; a `return value` would be silently discarded.
        if not task_history:
            yield _chatbot
            return
        item = task_history[-1]
        if item[1] is None:
            yield _chatbot
            return

        task_history[-1] = (item[0], None)
        chatbot_item = _chatbot.pop(-1)
        if chatbot_item[0] is None:
            # The popped row held only a reply: blank the reply of the row
            # that carries the user turn.
            _chatbot[-1] = (_chatbot[-1][0], None)
        else:
            _chatbot.append((chatbot_item[0], None))

        # Delegate to the predict callback defined alongside this one.
        yield from predict(_chatbot, task_history)

    return regenerate
206
+
207
+ predict = create_predict_fn()
208
+ regenerate = create_regenerate_fn()
209
+
210
def add_text(history, task_history, text):
    """Append a pending text turn to both histories.

    Args:
        history: displayed chatbot history, or ``None``.
        task_history: raw history, or ``None``.
        text: the user's message.

    Returns:
        ``(history, task_history, '')``: new lists with the turn appended
        (the displayed copy holds the parsed text), plus an empty string.
    """
    if history is None:
        history = []
    if task_history is None:
        task_history = []

    shown_turn = (_parse_text(text), None)
    raw_turn = (text, None)
    return history + [shown_turn], task_history + [raw_turn], ''
217
+
218
def add_file(history, task_history, file):
    """Append an uploaded file as a pending image turn to both histories.

    Args:
        history: displayed chatbot history, or ``None``.
        task_history: raw history, or ``None``.
        file: uploaded file object; only its ``name`` attribute is read.

    Returns:
        ``(history, task_history)`` as new lists, each ending with
        ``((file.name,), None)``.
    """
    if history is None:
        history = []
    if task_history is None:
        task_history = []

    return (
        history + [((file.name,), None)],
        task_history + [((file.name,), None)],
    )
224
+
225
def download_url_image(url):
    """Download the image at ``url`` into a local cache file.

    The cache file lives in the system temp directory and is named after the
    MD5 of the URL (used only as a file name, not for security), so a URL is
    downloaded at most once.

    The image is first written to a scratch file and then moved into place
    with ``os.replace``, so an interrupted or failed save can never leave a
    truncated file that later calls would treat as a valid cache hit.

    Args:
        url: HTTP(S) address of the image.

    Returns:
        The path of the cached JPEG on success. On any failure the original
        ``url`` is returned unchanged (deliberate best-effort behaviour).
    """
    partial_path = None
    try:
        url_hash = hashlib.md5(url.encode()).hexdigest()
        temp_dir = tempfile.gettempdir()
        temp_path = os.path.join(temp_dir, f"hyocr_demo_{url_hash}.jpg")

        # Already downloaded: reuse it.
        if os.path.exists(temp_path):
            return temp_path

        response = requests.get(url, timeout=10)
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert('RGB')

        # Write next to the final location so os.replace stays on one
        # filesystem and is atomic.
        fd, partial_path = tempfile.mkstemp(
            prefix="hyocr_partial_", suffix=".jpg", dir=temp_dir
        )
        os.close(fd)
        img.save(partial_path)
        os.replace(partial_path, temp_path)
        return temp_path
    except Exception as e:
        # Best-effort fallback: the caller can still try the URL directly.
        print(f"下载图片失败: {url}, 错误: {e}")
        if partial_path is not None:
            try:
                os.remove(partial_path)
            except OSError:
                pass
        return url
246
+
247
def reset_user_input():
    """Return a Gradio update that sets the bound component's value to ''."""
    cleared = gr.update(value='')
    return cleared
249
+
250
def reset_state(_chatbot, task_history):
    """Clear both histories in place, free memory, and return an empty list.

    Args:
        _chatbot: displayed chatbot history (cleared in place).
        task_history: raw history (cleared in place).

    Returns:
        A new empty list.
    """
    for conversation_log in (task_history, _chatbot):
        conversation_log.clear()
    _gc()
    return []
255
+
256
+ # 示例图片路径配置 - 请替换为实际图片路径
257
+ EXAMPLE_IMAGES = {
258
+ "spotting": "https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/23cc43af9376b948f3febaf4ce854a8a.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763452661%3B1794556721&q-key-time=1763452661%3B1794556721&q-header-list=host&q-url-param-list=&q-signature=f39c909f209d2b84e3de648e2842942ad5a47d7a", # TODO: 替换为场景文字示例图片路径
259
+ "parsing": "https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/c4997ebd1be9f7c3e002fabba8b46cb7.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763455327%3B1794559387&q-key-time=1763455327%3B1794559387&q-header-list=host&q-url-param-list=&q-signature=6a4c093087ab2c76bca363456b70831d1304bc67",
260
+ "ie": "https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/7c67c0f78e4423d51644a325da1f8e85.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763455327%3B1794559387&q-key-time=1763455327%3B1794559387&q-header-list=host&q-url-param-list=&q-signature=10aceb21db90dc61e843103f9316f975719ea84d",
261
+ "vqa": "https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/fea0865d1c70c53aaa2ab91cd0e787f5.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763455328%3B1794559388&q-key-time=1763455328%3B1794559388&q-header-list=host&q-url-param-list=&q-signature=09f62488e9fd33f09795de0faf9b855f95299466",
262
+ "translation": "https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/1bdacfd77c09f20ec8bc043933b815d6.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763455328%3B1794559388&q-key-time=1763455328%3B1794559388&q-header-list=host&q-url-param-list=&q-signature=c7214858ebd48824565cd21898a32d0464373009",
263
+ # "spotting": "examples/spotting.jpg",
264
+ # "parsing": "examples/parsing.jpg",
265
+ # "ie": "examples/ie.jpg",
266
+ # "vqa": "examples/vqa.jpg",
267
+ # "translation": "examples/translation.jpg"
268
+ }
269
+
270
+ with gr.Blocks(css="""
271
+ body {
272
+ background: #f5f7fa;
273
+ }
274
+ .gradio-container {
275
+ max-width: 100% !important;
276
+ padding: 0 40px !important;
277
+ }
278
+ .header-section {
279
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
280
+ padding: 30px 0;
281
+ margin: -20px -40px 30px -40px;
282
+ box-shadow: 0 2px 10px rgba(0,0,0,0.1);
283
+ }
284
+ .header-content {
285
+ max-width: 1600px;
286
+ margin: 0 auto;
287
+ padding: 0 40px;
288
+ display: flex;
289
+ align-items: center;
290
+ gap: 20px;
291
+ }
292
+ .header-logo {
293
+ height: 60px;
294
+ }
295
+ .header-text h1 {
296
+ color: white;
297
+ font-size: 32px;
298
+ font-weight: bold;
299
+ margin: 0 0 5px 0;
300
+ }
301
+ .header-text p {
302
+ color: rgba(255,255,255,0.9);
303
+ margin: 0;
304
+ font-size: 14px;
305
+ }
306
+ .main-container {
307
+ max-width: 1800px;
308
+ margin: 0 auto;
309
+ }
310
+ .chatbot {
311
+ box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08) !important;
312
+ border-radius: 12px !important;
313
+ border: 1px solid #e5e7eb !important;
314
+ background: white !important;
315
+ }
316
+ .input-panel {
317
+ background: white;
318
+ padding: 20px;
319
+ border-radius: 12px;
320
+ box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
321
+ border: 1px solid #e5e7eb;
322
+ }
323
+ .input-box textarea {
324
+ border: 2px solid #e5e7eb !important;
325
+ border-radius: 8px !important;
326
+ font-size: 14px !important;
327
+ }
328
+ .input-box textarea:focus {
329
+ border-color: #667eea !important;
330
+ }
331
+ .btn-primary {
332
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%) !important;
333
+ border: none !important;
334
+ color: white !important;
335
+ font-weight: 500 !important;
336
+ padding: 10px 24px !important;
337
+ font-size: 14px !important;
338
+ }
339
+ .btn-primary:hover {
340
+ transform: translateY(-1px) !important;
341
+ box-shadow: 0 4px 12px rgba(102, 126, 234, 0.4) !important;
342
+ }
343
+ .btn-secondary {
344
+ background: white !important;
345
+ border: 2px solid #667eea !important;
346
+ color: #667eea !important;
347
+ padding: 8px 20px !important;
348
+ font-size: 14px !important;
349
+ }
350
+ .btn-secondary:hover {
351
+ background: #f0f4ff !important;
352
+ }
353
+ .example-grid {
354
+ display: grid;
355
+ grid-template-columns: repeat(4, 1fr);
356
+ gap: 20px;
357
+ margin-top: 30px;
358
+ }
359
+ .example-card {
360
+ background: white;
361
+ border-radius: 12px;
362
+ overflow: hidden;
363
+ box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
364
+ border: 1px solid #e5e7eb;
365
+ transition: all 0.3s ease;
366
+ }
367
+ .example-card:hover {
368
+ transform: translateY(-4px);
369
+ box-shadow: 0 8px 20px rgba(102, 126, 234, 0.15);
370
+ border-color: #667eea;
371
+ }
372
+ .example-image-wrapper {
373
+ width: 100%;
374
+ height: 180px;
375
+ overflow: hidden;
376
+ background: #f5f7fa;
377
+ }
378
+ .example-image-wrapper img {
379
+ width: 100%;
380
+ height: 100%;
381
+ object-fit: cover;
382
+ }
383
+ .example-btn {
384
+ width: 100% !important;
385
+ white-space: pre-wrap !important;
386
+ text-align: left !important;
387
+ padding: 16px !important;
388
+ background: white !important;
389
+ border: none !important;
390
+ border-top: 1px solid #e5e7eb !important;
391
+ color: #1f2937 !important;
392
+ font-size: 14px !important;
393
+ line-height: 1.6 !important;
394
+ transition: all 0.3s ease !important;
395
+ font-weight: 500 !important;
396
+ }
397
+ .example-btn:hover {
398
+ background: #f9fafb !important;
399
+ color: #667eea !important;
400
+ }
401
+ .feature-section {
402
+ background: white;
403
+ padding: 24px;
404
+ border-radius: 12px;
405
+ margin-top: 30px;
406
+ box-shadow: 0 2px 8px rgba(0, 0, 0, 0.08);
407
+ border: 1px solid #e5e7eb;
408
+ }
409
+ .section-title {
410
+ font-size: 18px;
411
+ font-weight: 600;
412
+ color: #1f2937;
413
+ margin-bottom: 20px;
414
+ padding-bottom: 12px;
415
+ border-bottom: 2px solid #e5e7eb;
416
+ }
417
+ """) as demo:
418
+ # 顶部导航栏
419
+ gr.HTML("""
420
+ <div class="header-section">
421
+ <div class="header-content">
422
+ <img src="https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/6ef6928b21b323b2b00115f86a779d8f.png?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763450355%3B1794554415&q-key-time=1763450355%3B1794554415&q-header-list=host&q-url-param-list=&q-signature=41328696dc34571324aa18c791c1196192e729c6" class="header-logo"/>
423
+ <div class="header-text">
424
+ <h1>HunyuanOCR</h1>
425
+ <p>Powered by Tencent Hunyuan Team</p>
426
+ </div>
427
+ </div>
428
+ </div>
429
+ """)
430
+
431
+ with gr.Column(elem_classes=["main-container"]):
432
+ # 对话区域 - 全宽
433
+ chatbot = gr.Chatbot(
434
+ label='💬 对话窗口',
435
+ height=600,
436
+ bubble_full_width=False,
437
+ layout="bubble",
438
+ show_copy_button=True,
439
+ avatar_images=(None, "https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/6ef6928b21b323b2b00115f86a779d8f.png?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763450355%3B1794554415&q-key-time=1763450355%3B1794554415&q-header-list=host&q-url-param-list=&q-signature=41328696dc34571324aa18c791c1196192e729c6"),
440
+ elem_classes=["chatbot"]
441
+ )
442
+
443
+ # 输入控制面板 - 全宽
444
+ with gr.Group(elem_classes=["input-panel"]):
445
+ query = gr.Textbox(
446
+ lines=2,
447
+ label='💭 输入您的问题',
448
+ placeholder='请先上传图片,然后输入问题。例如:检测并识别图片中的文字,将文本坐标格式化输出。',
449
+ elem_classes=["input-box"],
450
+ show_label=False
451
+ )
452
+
453
+ with gr.Row():
454
+ addfile_btn = gr.UploadButton('📁 上传图片', file_types=['image'], elem_classes=["btn-secondary"])
455
+ submit_btn = gr.Button('🚀 发送消息', variant="primary", elem_classes=["btn-primary"], scale=3)
456
+ regen_btn = gr.Button('🔄 重新生成', elem_classes=["btn-secondary"])
457
+ empty_bin = gr.Button('🗑️ 清空对话', elem_classes=["btn-secondary"])
458
+
459
+ # 示例区域 - 5列网格布局
460
+ gr.HTML('<div class="section-title">📚 快速体验示例 - 点击下方卡片快速加载</div>')
461
+
462
+ with gr.Row():
463
+ # 示例1:spotting
464
+ with gr.Column(scale=1):
465
+ with gr.Group(elem_classes=["example-card"]):
466
+ gr.HTML("""
467
+ <div class="example-image-wrapper">
468
+ <img src="https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/23cc43af9376b948f3febaf4ce854a8a.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763452661%3B1794556721&q-key-time=1763452661%3B1794556721&q-header-list=host&q-url-param-list=&q-signature=f39c909f209d2b84e3de648e2842942ad5a47d7a" alt="文字检测识别"/>
469
+ </div>
470
+ """)
471
+ example_1_btn = gr.Button("🔍 文字检测和识别", elem_classes=["example-btn"])
472
+
473
+ # 示例2:parsing
474
+ with gr.Column(scale=1):
475
+ with gr.Group(elem_classes=["example-card"]):
476
+ gr.HTML("""
477
+ <div class="example-image-wrapper">
478
+ <img src="https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/c4997ebd1be9f7c3e002fabba8b46cb7.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763455327%3B1794559387&q-key-time=1763455327%3B1794559387&q-header-list=host&q-url-param-list=&q-signature=6a4c093087ab2c76bca363456b70831d1304bc67" alt="文档解析"/>
479
+ </div>
480
+ """)
481
+ example_2_btn = gr.Button("📋 文档解析", elem_classes=["example-btn"])
482
+
483
+ # 示例3:ie
484
+ with gr.Column(scale=1):
485
+ with gr.Group(elem_classes=["example-card"]):
486
+ gr.HTML("""
487
+ <div class="example-image-wrapper">
488
+ <img src="https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/7c67c0f78e4423d51644a325da1f8e85.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763455327%3B1794559387&q-key-time=1763455327%3B1794559387&q-header-list=host&q-url-param-list=&q-signature=10aceb21db90dc61e843103f9316f975719ea84d" alt="信息抽取"/>
489
+ </div>
490
+ """)
491
+ example_3_btn = gr.Button("🎯 信息抽取", elem_classes=["example-btn"])
492
+
493
+ # 示例4:VQA
494
+ with gr.Column(scale=1):
495
+ with gr.Group(elem_classes=["example-card"]):
496
+ gr.HTML("""
497
+ <div class="example-image-wrapper">
498
+ <img src="https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/fea0865d1c70c53aaa2ab91cd0e787f5.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763455328%3B1794559388&q-key-time=1763455328%3B1794559388&q-header-list=host&q-url-param-list=&q-signature=09f62488e9fd33f09795de0faf9b855f95299466" alt="视觉问答"/>
499
+ </div>
500
+ """)
501
+ example_4_btn = gr.Button("💬 视觉问答", elem_classes=["example-btn"])
502
+
503
+ # 示例5:translation
504
+ with gr.Column(scale=1):
505
+ with gr.Group(elem_classes=["example-card"]):
506
+ gr.HTML("""
507
+ <div class="example-image-wrapper">
508
+ <img src="https://hunyuan-multimodal-1258344703.cos.ap-guangzhou.myqcloud.com/hunyuan_multimodal/mllm_data/1bdacfd77c09f20ec8bc043933b815d6.jpg?q-sign-algorithm=sha1&q-ak=AKIDbLEFMUYZgyERZnygUQLC7xkQ1hTAzulX&q-sign-time=1763455328%3B1794559388&q-key-time=1763455328%3B1794559388&q-header-list=host&q-url-param-list=&q-signature=c7214858ebd48824565cd21898a32d0464373009" alt="图片翻译"/>
509
+ </div>
510
+ """)
511
+ example_5_btn = gr.Button("🌐 图片翻译", elem_classes=["example-btn"])
512
+
513
+ task_history = gr.State([])
514
+
515
+
516
def _load_example(image_key, prompt):
    """Build the outputs shared by all example-card callbacks.

    Looks up ``image_key`` in ``EXAMPLE_IMAGES``, passes the URL through
    ``download_url_image`` and starts a fresh conversation holding only that
    image.

    Args:
        image_key: key into ``EXAMPLE_IMAGES``.
        prompt: text to place in the query box.

    Returns:
        ``(history, task_hist, prompt)`` where both histories are new
        one-element lists ``[((image_path,), None)]``.
    """
    image_path = download_url_image(EXAMPLE_IMAGES[image_key])
    # Any previous conversation is discarded: both histories restart with
    # just the example image as a pending turn.
    return [((image_path,), None)], [((image_path,), None)], prompt


# The callbacks below take the current histories because Gradio passes the
# bound inputs, but they always start over and ignore them.

# Example 1: text spotting (detection + recognition)
def load_example_1(history, task_hist):
    """Load the text-spotting example image and prompt."""
    return _load_example("spotting", "检测并识别图片中的文字,将文本坐标格式化输出。")


# Example 2: document parsing
def load_example_2(history, task_hist):
    """Load the document-parsing example image and prompt."""
    return _load_example(
        "parsing",
        "提取文档图片中正文的所有信息用markdown 格式表示,其中页眉、页脚部分忽略,表格用html 格式表达,文档中公式用latex 格式表示,按照阅读顺序组织进行解析。",
    )


# Example 3: information extraction
def load_example_3(history, task_hist):
    """Load the information-extraction example image and prompt."""
    return _load_example(
        "ie",
        "提取图片中的:['单价', '上车时间','发票号码', '省前缀', '总金额', '发票代码', '下车时间', '里程数'] 的字段内容,并且按照JSON格式返回。",
    )


# Example 4: visual question answering
def load_example_4(history, task_hist):
    """Load the visual-question-answering example image and prompt."""
    return _load_example("vqa", "What is the highest life expectancy at birth of male?")


# Example 5: image translation
def load_example_5(history, task_hist):
    """Load the image-translation example image and prompt."""
    return _load_example("translation", "提取图中文字,并将其翻译成英文。")
584
+
585
+ # 绑定事件
586
# Each example card replaces the conversation and fills in the query box.
for example_btn, example_loader in (
    (example_1_btn, load_example_1),
    (example_2_btn, load_example_2),
    (example_3_btn, load_example_3),
    (example_4_btn, load_example_4),
    (example_5_btn, load_example_5),
):
    example_btn.click(example_loader, [chatbot, task_history], [chatbot, task_history, query])

# add_text returns three values (both histories plus '' for the query box),
# so all three components are listed as outputs; the previous two-element
# list did not match the handler's return arity.
submit_btn.click(
    add_text, [chatbot, task_history, query], [chatbot, task_history, query]
).then(predict, [chatbot, task_history], [chatbot], show_progress=True)
submit_btn.click(reset_user_input, [], [query])
empty_bin.click(reset_state, [chatbot, task_history], [chatbot], show_progress=True)
regen_btn.click(regenerate, [chatbot, task_history], [chatbot], show_progress=True)
addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn], [chatbot, task_history], show_progress=True)
598
+
599
+ # 功能说明区域
600
+ with gr.Row():
601
+ with gr.Column(scale=1):
602
+ gr.HTML("""
603
+ <div class="feature-section">
604
+ <div class="section-title">✨ 核心功能</div>
605
+ <ul style="line-height: 2; color: #4b5563; font-size: 14px; margin: 0; padding-left: 20px;">
606
+ <li><strong>🎯 高精度文字检测识别</strong> - 支持多场景文字检测与识别</li>
607
+ <li><strong>📐 智能文档解析</strong> - 自动识别文档结构,支持多粒度文档解析</li>
608
+ <li><strong>📋 信息提取</strong> - 支持30+高频卡证票据识别和结构化输出</li>
609
+ <li><strong>✏️ 视觉问答</strong> - 支持以文本为中心的开放式问答</li>
610
+ <li><strong>🌍 跨语言翻译</strong> - 支持中英互译及14+语种译为中英文</li>
611
+ </ul>
612
+ </div>
613
+ """)
614
+
615
+ with gr.Column(scale=1):
616
+ gr.HTML("""
617
+ <div class="feature-section">
618
+ <div class="section-title">💡 使用建议</div>
619
+ <ul style="line-height: 2; color: #4b5563; font-size: 14px; margin: 0; padding-left: 20px;">
620
+ <li><strong>图片质量</strong> - 确保图片清晰,光线充足,分辨率适中</li>
621
+ <li><strong>拍摄角度</strong> - 避免严重倾斜、遮挡或反光,正面拍摄效果最佳</li>
622
+ <li><strong>文件大小</strong> - 建议单张图片不超过 10MB,支持 JPG/PNG 格式</li>
623
+ <li><strong>使用场景</strong> - 适用于文字检测识别、文档数字化、票据识别、信息提取、文字图片翻译等</li>
624
+ <li><strong>合规使用</strong> - 仅供学习研究,请遵守法律法规,尊重隐私权</li>
625
+ </ul>
626
+ </div>
627
+ """)
628
+
629
+ # 底部版权信息
630
+ gr.HTML("""
631
+ <div style="text-align: center; color: #9ca3af; font-size: 13px; margin-top: 40px; padding: 20px; border-top: 1px solid #e5e7eb;">
632
+ <p style="margin: 0;">© 2025 Tencent Hunyuan Team. All rights reserved.</p>
633
+ <p style="margin: 5px 0 0 0;">本系统基于 HunyuanOCR 构建 | 仅供学习研究使用</p>
634
+ </div>
635
+ """)
636
+
637
+ demo.queue().launch(
638
+ share=args.share,
639
+ inbrowser=args.inbrowser,
640
+ # server_port=args.server_port,
641
+ # server_name=args.server_name,
642
+ )
643
+
644
+
645
def main():
    """Entry point: parse the CLI options, load the model, start the demo."""
    cli_args = _get_args()
    loaded_model, loaded_processor = _load_model_processor(cli_args)
    _launch_demo(cli_args, loaded_model, loaded_processor)
649
+
650
+
651
+ if __name__ == '__main__':
652
+ main()
requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ torch==2.6.0
2
+ git+https://github.com/ManaEstras/transformers.git@v4.57.1.hyvl
3
+ tokenizers
4
+ accelerate
5
+ einops
6
+ addict
7
+ easydict
8
+ torchvision
9
+ flash-attn @ https://github.com/Dao-AILab/flash-attention/releases/download/v2.7.3/flash_attn-2.7.3+cu12torch2.6cxx11abiFALSE-cp310-cp310-linux_x86_64.whl
10
+ PyMuPDF
11
+ hf_transfer
12
+ qwen_vl_utils