fix zerogpu error
app.py CHANGED
@@ -90,13 +90,21 @@ def _gc():
 
 
 def _launch_demo(args, model, processor):
-    # …
-    @spaces.GPU(duration=…)
-    def call_local_model(…):
+    # Key fix: drop the model and processor parameters; access them via closure
+    @spaces.GPU(duration=60)
+    def call_local_model(messages):
         import time
         start_time = time.time()
         print(f"[DEBUG] ========== Starting inference ==========")
-        …
+
+        # Key: check that the model is actually on the GPU
+        model_device = next(model.parameters()).device
+        print(f"[DEBUG] Model device: {model_device}")
+
+        if str(model_device) == 'cpu':
+            print(f"[ERROR] Model is on CPU! Trying to move it to the GPU...")
+            model.cuda()
+            print(f"[DEBUG] Model device after cuda(): {next(model.parameters()).device}")
 
         messages = [messages]
 
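For context, the shape of this fix: on a ZeroGPU Space, the `@spaces.GPU`-decorated function should receive only plain data, with heavyweight objects like the model and processor captured from the enclosing scope. A minimal, self-contained sketch of that pattern (the checkpoint name and decoding flow are illustrative, not this app's actual code):

import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen2.5-0.5B-Instruct"  # placeholder checkpoint, not the app's model
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_id)

@spaces.GPU(duration=60)  # a GPU is attached only while this call runs
def call_local_model(messages):
    # Only plain data crosses the boundary; model/tokenizer come from the closure.
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    with torch.no_grad():
        output_ids = model.generate(input_ids, max_new_tokens=256, do_sample=False)
    # Strip the prompt tokens and return a list, matching the call site below.
    return tokenizer.batch_decode(
        output_ids[:, input_ids.shape[-1]:], skip_special_tokens=True
    )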
@@ -117,26 +125,22 @@ def _launch_demo(args, model, processor):
             padding=True,
             return_tensors="pt",
         )
-        …
+        # Make sure the inputs are on the GPU
+        inputs = inputs.to('cuda' if torch.cuda.is_available() else 'cpu')
         print(f"[DEBUG] Input preparation done, took {time.time() - start_time:.2f}s")
         print(f"[DEBUG] Input IDs shape: {inputs.input_ids.shape}")
-        print(f"[DEBUG] …
+        print(f"[DEBUG] Input device: {inputs.input_ids.device}")
 
-        # …
+        # Generate
         gen_start = time.time()
         with torch.no_grad():
             generated_ids = model.generate(
                 **inputs,
-                max_new_tokens=…,
+                max_new_tokens=256,
                 repetition_penalty=1.03,
                 do_sample=False,
                 eos_token_id=processor.tokenizer.eos_token_id,
                 pad_token_id=processor.tokenizer.pad_token_id,
-                use_cache=True,
-                # Key: add a length penalty to encourage shorter outputs
-                length_penalty=0.8,
-                # Add early stopping
-                early_stopping=True,
             )
 
         gen_time = time.time() - gen_start
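Worth noting on the removed generation arguments: in transformers, `length_penalty` and `early_stopping` only influence beam search, so with `do_sample=False` and the default `num_beams=1` (greedy decoding) they are ignored, which is presumably why this commit drops them. A hedged sketch of what it would take for them to have an effect, with `model` and `inputs` assumed from the surrounding code:

with torch.no_grad():
    generated_ids = model.generate(
        **inputs,
        max_new_tokens=256,
        num_beams=4,          # beam search is what length_penalty/early_stopping act on
        length_penalty=0.8,   # values < 1.0 favor shorter beams
        early_stopping=True,  # stop once enough finished beam candidates exist
    )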
@@ -201,8 +205,8 @@ def _launch_demo(args, model, processor):
                 content = []
         messages.pop()
 
-        # …
-        response_list = call_local_model(…)
+        # Call the model for a response (changed: model and processor are no longer passed)
+        response_list = call_local_model(messages)
         response = response_list[0] if response_list else ""
 
         _chatbot[-1] = (_parse_text(chat_query), _remove_image_special(_parse_text(response)))
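A hypothetical call against the sketch above, mirroring the new call site (the messages payload follows the usual chat format and is illustrative only):

messages = [
    {"role": "user", "content": "Describe the picture in one sentence."},
]
response_list = call_local_model(messages)
response = response_list[0] if response_list else ""
print(response)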