import os
import torch
import numpy as np
from PIL import Image
import spaces
from transformers import AutoProcessor
from qwen_vl_utils import process_vision_info
from transformers import HunYuanVLForConditionalGeneration
import gradio as gr
from argparse import ArgumentParser
import copy
import requests
from io import BytesIO
import tempfile
import hashlib
import gc

# Optimization: Set environment variables
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
# torch._C._jit_set_profiling_executor(False)
# torch._C._jit_set_profiling_mode(False)





def _get_args():
    parser = ArgumentParser()

    parser.add_argument('-c',
                        '--checkpoint-path',
                        type=str,
                        default='tencent/HunyuanOCR',
                        help='Checkpoint name or path, default to %(default)r')
    parser.add_argument('--cpu-only', action='store_true', help='Run demo with CPU only')

    parser.add_argument('--flash-attn2',
                        action='store_true',
                        default=False,
                        help='Enable flash_attention_2 when loading the model.')
    parser.add_argument('--share',
                        action='store_true',
                        default=False,
                        help='Create a publicly shareable link for the interface.')
    parser.add_argument('--inbrowser',
                        action='store_true',
                        default=False,
                        help='Automatically launch the interface in a new tab on the default browser.')
    

    args = parser.parse_args()
    return args
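
# Example invocation (the entry-point filename `app.py` is an assumption; adjust to this file's actual name):
#   python app.py --checkpoint-path tencent/HunyuanOCR --share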


def _load_model_processor(args):
    # ZeroGPU: Model loads on CPU, uses eager mode
    # Automatically moves to GPU within @spaces.GPU decorator
    print(f"[INFO] Loading model (ZeroGPU uses eager mode)")
    print(f"[INFO] CUDA available at load time: {torch.cuda.is_available()}")
    
    model = HunYuanVLForConditionalGeneration.from_pretrained(
        args.checkpoint_path,
        attn_implementation="eager",  # Required for ZeroGPU (starts on CPU)
        torch_dtype=torch.bfloat16,
        device_map="auto",  # Let ZeroGPU manage device placement
    )

    # Disable gradient checkpointing for faster inference
    if hasattr(model, 'gradient_checkpointing_disable'):
        model.gradient_checkpointing_disable()
        print(f"[INFO] Gradient checkpointing disabled")

    # Set to evaluation mode
    model.eval()
    print(f"[INFO] Model set to eval mode")
    
    processor = AutoProcessor.from_pretrained(args.checkpoint_path, use_fast=False, trust_remote_code=True)
    
    print(f"[INFO] Model loaded, device: {next(model.parameters()).device}")
    return model, processor


def _parse_text(text):
    """Parse text, handle special formatting"""
    # if text is None:
    #     return text
    text = text.replace("<trans>", "").replace("</trans>", "")
    return text


def _remove_image_special(text):
    """Remove image special tokens"""
    # if text is None:
    #     return text
    # # Remove image special tokens
    # import re
    # text = re.sub(r'<image>|</image>|<img>|</img>', '', text)
    # return text
    return text


def _gc():
    """Garbage collection"""
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

def clean_repeated_substrings(text):
    """Clean repeated substrings in text"""
    n = len(text)
    if n < 2000:
        return text
    for length in range(2, n // 10 + 1):
        candidate = text[-length:] 
        count = 0
        i = n - length
        
        while i >= 0 and text[i:i + length] == candidate:
            count += 1
            i -= length

        if count >= 10:
            return text[:n - length * (count - 1)]  

    return text


def _launch_demo(args, model, processor):
    # Track first call
    first_call = [True]

    # Uses closure to access model and processor
    # Duration increased to 120s to avoid timeout during peak hours
    @spaces.GPU(duration=120)
    def call_local_model(messages):
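        """Run a single generation pass for one conversation.

        `messages` is a list of chat turns in the format that `predict`
        builds below (inferred from that code), e.g.:
            [{'role': 'user',
              'content': [{'type': 'image', 'image': '/abs/path.jpg'},
                          {'type': 'text', 'text': 'Detect and recognize the text.'}]}]
        Returns a list containing one decoded output string.
        """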
        import time
        import sys
        start_time = time.time()
        
        if first_call[0]:
            print(f"[INFO] ========== First inference call ==========")
            first_call[0] = False
        else:
            print(f"[INFO] ========== Subsequent inference call ==========")

        print(f"[DEBUG] ========== Starting inference ==========")
        print(f"[DEBUG] Python version: {sys.version}")
        print(f"[DEBUG] PyTorch version: {torch.__version__}")
        print(f"[DEBUG] CUDA available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            print(f"[DEBUG] CUDA device count: {torch.cuda.device_count()}")
            print(f"[DEBUG] Current CUDA device: {torch.cuda.current_device()}")
            print(f"[DEBUG] Device name: {torch.cuda.get_device_name(0)}")
            print(f"[DEBUG] GPU Memory allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
            print(f"[DEBUG] GPU Memory reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")
        
        # Ensure model is on GPU
        model_device = next(model.parameters()).device
        print(f"[DEBUG] Model device: {model_device}")
        print(f"[DEBUG] Model dtype: {next(model.parameters()).dtype}")

        if str(model_device) == 'cpu':
            print(f"[ERROR] Model on CPU! Attempting to move to GPU...")
            if torch.cuda.is_available():
                move_start = time.time()
                model.cuda()
                move_time = time.time() - move_start
                print(f"[DEBUG] Model device after cuda(): {next(model.parameters()).device}")
                print(f"[DEBUG] Model moved to GPU in: {move_time:.2f}s")
            else:
                print(f"[CRITICAL] CUDA unavailable! Running on CPU will be slow!")
                print(f"[CRITICAL] This may be due to ZeroGPU resource constraints")
        else:
            print(f"[INFO] Model already on GPU: {model_device}")
        
        messages = [messages]
        
        # Build input using processor
        texts = [
            processor.apply_chat_template(msg, tokenize=False, add_generation_prompt=True)
            for msg in messages
        ]
        print(f"[DEBUG] Template built, elapsed: {time.time() - start_time:.2f}s")

        image_inputs, video_inputs = process_vision_info(messages)
        print(f"[DEBUG] Image processing done, elapsed: {time.time() - start_time:.2f}s")

        # Check image input size
        if image_inputs:
            for idx, img in enumerate(image_inputs):
                if hasattr(img, 'size'):
                    print(f"[DEBUG] Image {idx} size: {img.size}")
                elif isinstance(img, np.ndarray):
                    print(f"[DEBUG] Image {idx} shape: {img.shape}")
        
        print(f"[DEBUG] Starting processor encoding...")
        processor_start = time.time()
        inputs = processor(
            text=texts,
            images=image_inputs,
            videos=video_inputs,
            padding=True,
            return_tensors="pt",
        )
        print(f"[DEBUG] Processor encoding done, elapsed: {time.time() - processor_start:.2f}s")

        # Ensure inputs on GPU
        to_device_start = time.time()
        inputs = inputs.to('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"[DEBUG] Inputs moved to device, elapsed: {time.time() - to_device_start:.2f}s")
        print(f"[DEBUG] Input preparation done, total elapsed: {time.time() - start_time:.2f}s")
        print(f"[DEBUG] Input IDs shape: {inputs.input_ids.shape}")
        print(f"[DEBUG] Input device: {inputs.input_ids.device}")
        print(f"[DEBUG] Input sequence length: {inputs.input_ids.shape[1]}")
        
        # Generation
        gen_start = time.time()
        print(f"[DEBUG] ========== Starting token generation ==========")

        # Optimized max_new_tokens for OCR tasks
        max_new_tokens = 2048
        print(f"[DEBUG] max_new_tokens: {max_new_tokens}")

        # Progress callback (defined for debugging; not currently passed to model.generate())
        token_count = [0]
        last_time = [gen_start]
        
        def progress_callback(input_ids, scores, **kwargs):
            token_count[0] += 1
            current_time = time.time()
            if token_count[0] % 10 == 0 or (current_time - last_time[0]) > 2.0:
                elapsed = current_time - gen_start
                tokens_per_sec = token_count[0] / elapsed if elapsed > 0 else 0
                print(f"[DEBUG] Generated {token_count[0]} tokens, speed: {tokens_per_sec:.2f} tokens/s, elapsed: {elapsed:.2f}s")
                last_time[0] = current_time
            return False
        
        with torch.no_grad():
            print(f"[DEBUG] Entered torch.no_grad() context, elapsed: {time.time() - start_time:.2f}s")

            # Test forward pass
            print(f"[DEBUG] Testing forward pass...")
            forward_test_start = time.time()
            try:
                with torch.autocast('cuda', dtype=torch.bfloat16):  # torch.cuda.amp.autocast is deprecated
                    test_outputs = model(**inputs, use_cache=False)
                print(f"[DEBUG] Forward pass test successful, elapsed: {time.time() - forward_test_start:.2f}s")
            except Exception as e:
                print(f"[WARNING] Forward pass test failed: {e}")

            print(f"[DEBUG] Starting model.generate()... (elapsed: {time.time() - start_time:.2f}s)")
            generate_call_start = time.time()
            
            try:
                # Deterministic generation
                generated_ids = model.generate(
                    **inputs,
                    max_new_tokens=max_new_tokens,
                    do_sample=False  # greedy decoding; temperature is ignored when sampling is disabled
                )
                print(f"[DEBUG] model.generate() returned, elapsed: {time.time() - generate_call_start:.2f}s")
            except Exception as e:
                print(f"[ERROR] Generation failed: {e}")
                import traceback
                traceback.print_exc()
                raise
        
        print(f"[DEBUG] Exited torch.no_grad() context")

        gen_time = time.time() - gen_start
        print(f"[DEBUG] ========== Generation complete ==========")
        print(f"[DEBUG] Generation time: {gen_time:.2f}s")
        print(f"[DEBUG] Output shape: {generated_ids.shape}")

        # Decode output
        if "input_ids" in inputs:
            input_ids = inputs.input_ids
        else:
            input_ids = inputs.inputs

        generated_ids_trimmed = [
            out_ids[len(in_ids):] for in_ids, out_ids in zip(input_ids, generated_ids)
        ]
        
        actual_tokens = len(generated_ids_trimmed[0])
        print(f"[DEBUG] Actual tokens generated: {actual_tokens}")
        print(f"[DEBUG] Time per token: {gen_time/actual_tokens if actual_tokens > 0 else 0:.3f}s")

        output_texts = processor.batch_decode(
            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )
        
        
        total_time = time.time() - start_time
        print(f"[DEBUG] ========== All done ==========")
        print(f"[DEBUG] Total time: {total_time:.2f}s")
        print(f"[DEBUG] Output length: {len(output_texts[0])} chars")
        print(f"[DEBUG] Output preview: {output_texts[0][:100]}...")
        output_texts[0] = clean_repeated_substrings(output_texts[0])
        return output_texts
    

    def create_predict_fn():

        def predict(_chatbot, task_history):
            nonlocal model, processor
            chat_query = _chatbot[-1][0]
            query = task_history[-1][0]
            if len(chat_query) == 0:
                _chatbot.pop()
                task_history.pop()
                return _chatbot
            print('User: ', query)
            history_cp = copy.deepcopy(task_history)
            full_response = ''
            messages = []
            content = []
            for q, a in history_cp:
                if isinstance(q, (tuple, list)):
                    # Check if URL or local path
                    img_path = q[0]
                    if img_path.startswith(('http://', 'https://')):
                        content.append({'type': 'image', 'image': img_path})
                    else:
                        content.append({'type': 'image', 'image': f'{os.path.abspath(img_path)}'})
                else:
                    content.append({'type': 'text', 'text': q})
                    messages.append({'role': 'user', 'content': content})
                    messages.append({'role': 'assistant', 'content': [{'type': 'text', 'text': a}]})
                    content = []
            messages.pop()
            
            # Call model to get response
            response_list = call_local_model(messages)
            response = response_list[0] if response_list else ""
            
            _chatbot[-1] = (_parse_text(chat_query), _remove_image_special(_parse_text(response)))
            full_response = _parse_text(response)

            task_history[-1] = (query, full_response)
            print('HunyuanOCR: ' + _parse_text(full_response))
            yield _chatbot

        return predict
    
    def create_regenerate_fn():

        def regenerate(_chatbot, task_history):
            nonlocal model, processor
            if not task_history:
                return _chatbot
            item = task_history[-1]
            if item[1] is None:
                return _chatbot
            task_history[-1] = (item[0], None)
            chatbot_item = _chatbot.pop(-1)
            if chatbot_item[0] is None:
                _chatbot[-1] = (_chatbot[-1][0], None)
            else:
                _chatbot.append((chatbot_item[0], None))
            # Use outer predict function
            _chatbot_gen = predict(_chatbot, task_history)
            for _chatbot in _chatbot_gen:
                yield _chatbot

        return regenerate

    predict = create_predict_fn()
    regenerate = create_regenerate_fn()

    def add_text(history, task_history, text):
        task_text = text
        history = history if history is not None else []
        task_history = task_history if task_history is not None else []
        history = history + [(_parse_text(text), None)]
        task_history = task_history + [(task_text, None)]
        return history, task_history  # the textbox is cleared separately via reset_user_input

    def add_file(history, task_history, file):
        history = history if history is not None else []
        task_history = task_history if task_history is not None else []
        history = history + [((file.name,), None)]
        task_history = task_history + [((file.name,), None)]
        return history, task_history
    
    def download_url_image(url):
        """Download URL image to local temp file"""
        try:
            # Use URL hash as filename to avoid duplicate downloads
            url_hash = hashlib.md5(url.encode()).hexdigest()
            temp_dir = tempfile.gettempdir()
            temp_path = os.path.join(temp_dir, f"hyocr_demo_{url_hash}.png")

            # Return cached file if exists
            if os.path.exists(temp_path):
                return temp_path

            # Download image
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            with open(temp_path, 'wb') as f:
                f.write(response.content)
            return temp_path
        except Exception as e:
            print(f"Failed to download image: {url}, error: {e}")
            return url  # Return original URL on failure

    def reset_user_input():
        return gr.update(value='')

    def reset_state(_chatbot, task_history):
        task_history.clear()
        _chatbot.clear()
        _gc()
        return []

    # Example image paths - local files
    EXAMPLE_IMAGES = {
        "spotting": "examples/spotting.jpg",
        "parsing": "examples/parsing.jpg",
        "ie": "examples/ie.jpg",
        "vqa": "examples/vqa.jpg",
        "translation": "examples/translation.jpg"
    }
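    # These paths are relative to the working directory; the example buttons assume the files exist.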

    with gr.Blocks() as demo:
        # Header
        gr.Markdown("# HunyuanOCR\n*Powered by Tencent Hunyuan Team*")
        
        with gr.Column():
            # Chat area
            chatbot = gr.Chatbot(
                label='Chat',
                height=600,
                bubble_full_width=False,
                layout="bubble",
                show_copy_button=True,
            )

            # Input panel
            with gr.Group():
                query = gr.Textbox(
                    lines=2,
                    label='Enter your question',
                    placeholder='Upload an image first, then enter your question. Example: Detect and recognize text in this image.',
                    show_label=False
                )

                with gr.Row():
                    addfile_btn = gr.UploadButton('Upload Image', file_types=['image'])
                    submit_btn = gr.Button('Send', variant="primary", scale=3)
                    regen_btn = gr.Button('Regenerate')
                    empty_bin = gr.Button('Clear')
            
            # Examples section
            gr.Markdown("### Quick Examples - Click to load")

            with gr.Row():
                example_1_btn = gr.Button("Text Detection")
                example_2_btn = gr.Button("Document Parsing")
                example_3_btn = gr.Button("Info Extraction")
                example_4_btn = gr.Button("Visual Q&A")
                example_5_btn = gr.Button("Translation")
        
        task_history = gr.State([])
        
        
        # Example 1: Text Detection
        def load_example_1(history, task_hist):
            prompt = "Detect and recognize all text in this image. Output the text with bounding box coordinates."
            image_path = EXAMPLE_IMAGES["spotting"]
            history = [((image_path,), None)]
            task_hist = [((image_path,), None)]
            return history, task_hist, prompt

        # Example 2: Document Parsing
        def load_example_2(history, task_hist):
            prompt = "Extract all text from this document in markdown format. Use HTML for tables and LaTeX for equations. Parse in reading order."
            image_path = EXAMPLE_IMAGES["parsing"]
            history = [((image_path,), None)]
            task_hist = [((image_path,), None)]
            return history, task_hist, prompt

        # Example 3: Information Extraction
        def load_example_3(history, task_hist):
            prompt = "Extract the following fields from this receipt and return as JSON: ['total', 'subtotal', 'tax', 'date', 'items']"
            image_path = EXAMPLE_IMAGES["ie"]
            history = [((image_path,), None)]
            task_hist = [((image_path,), None)]
            return history, task_hist, prompt

        # Example 4: Visual Q&A
        def load_example_4(history, task_hist):
            prompt = "Look at this chart and answer: Which quarter had the highest revenue? What was the Sales value in Q4?"
            image_path = EXAMPLE_IMAGES["vqa"]
            history = [((image_path,), None)]
            task_hist = [((image_path,), None)]
            return history, task_hist, prompt

        # Example 5: Translation
        def load_example_5(history, task_hist):
            prompt = "Translate all text in this image to English."
            image_path = EXAMPLE_IMAGES["translation"]
            history = [((image_path,), None)]
            task_hist = [((image_path,), None)]
            return history, task_hist, prompt
        
        # Bind events
        example_1_btn.click(load_example_1, [chatbot, task_history], [chatbot, task_history, query])
        example_2_btn.click(load_example_2, [chatbot, task_history], [chatbot, task_history, query])
        example_3_btn.click(load_example_3, [chatbot, task_history], [chatbot, task_history, query])
        example_4_btn.click(load_example_4, [chatbot, task_history], [chatbot, task_history, query])
        example_5_btn.click(load_example_5, [chatbot, task_history], [chatbot, task_history, query])
        
        submit_btn.click(add_text, [chatbot, task_history, query],
                         [chatbot, task_history]).then(predict, [chatbot, task_history], [chatbot], show_progress=True)
        submit_btn.click(reset_user_input, [], [query])
        empty_bin.click(reset_state, [chatbot, task_history], [chatbot], show_progress=True)
        regen_btn.click(regenerate, [chatbot, task_history], [chatbot], show_progress=True)
        addfile_btn.upload(add_file, [chatbot, task_history, addfile_btn], [chatbot, task_history], show_progress=True)

        # Feature descriptions
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("""
### Core Features
- **Text Detection & Recognition** - Multi-scene text detection and recognition
- **Document Parsing** - Automatic document structure recognition
- **Information Extraction** - Extract structured data from receipts and forms
- **Visual Q&A** - Text-centric open-ended question answering
- **Translation** - Translate text in images across 14+ languages
                """)

            with gr.Column(scale=1):
                gr.Markdown("""
### Usage Tips
- **Inference** - For production, use vLLM for better performance
- **Image Quality** - Ensure images are clear, well-lit, and not heavily skewed
- **File Size** - Recommended max 10MB per image, JPG/PNG format
- **Use Cases** - OCR, document digitization, receipt recognition, translation
                """)

        # Footer
        gr.Markdown("---\n*2025 Tencent Hunyuan Team. For research and educational use.*")

    demo.queue().launch(
        share=args.share,
        inbrowser=args.inbrowser,
        # server_port=args.server_port,
        # server_name=args.server_name,
    )


def main():
    args = _get_args()
    model, processor = _load_model_processor(args)
    _launch_demo(args, model, processor)


if __name__ == '__main__':
    main()