debug zerogpu timeout error
app.py CHANGED
@@ -51,6 +51,8 @@ def _load_model_processor(args):
     # ZeroGPU environment: the model is loaded on CPU in eager mode
     # and is moved to the GPU automatically inside the @spaces.GPU decorator
     print(f"[INFO] Loading model (eager mode in the ZeroGPU environment)")
+    print(f"[INFO] CUDA available at load time: {torch.cuda.is_available()}")
+
     model = HunYuanVLForConditionalGeneration.from_pretrained(
         args.checkpoint_path,
         attn_implementation="eager",  # ZeroGPU must use eager, since the model starts on CPU
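Note: the eager requirement above comes from where the weights start out. A minimal sketch (not part of this commit) that makes the backend choice explicit rather than hard-coded, assuming only that torch is importable:

import torch

# Sketch: pick the attention backend from what is visible at load time.
# On ZeroGPU the process typically starts without a GPU, so this resolves
# to "eager"; with a resident GPU it could use "sdpa" instead.
attn_impl = "eager" if not torch.cuda.is_available() else "sdpa"
print(f"[INFO] attn_implementation={attn_impl}")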
@@ -59,7 +61,8 @@ def _load_model_processor(args):
         token=os.environ.get('HF_TOKEN')
     )
     processor = AutoProcessor.from_pretrained(args.checkpoint_path, use_fast=False, trust_remote_code=True)
-
+
+    print(f"[INFO] Model loaded, current device: {next(model.parameters()).device}")
     return model, processor
 
 
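The new [INFO] line only reports the device. A hypothetical helper (not in the Space) that bundles the device and dtype checks this commit scatters across its logs:

def describe_weights(model):
    # Summarize where the weights live; `model` is any torch nn.Module.
    p = next(model.parameters())
    return f"device={p.device}, dtype={p.dtype}"

# Usage: print(f"[INFO] Model loaded: {describe_weights(model)}")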
@@ -91,20 +94,38 @@ def _gc():
 
 def _launch_demo(args, model, processor):
     # Key fix: remove the model and processor parameters and access them via the closure
-
+    # Increase duration to 120 seconds to avoid timeouts during peak hours
+    @spaces.GPU(duration=120)
     def call_local_model(messages):
         import time
         start_time = time.time()
         print(f"[DEBUG] ========== Starting inference ==========")
+        print(f"[DEBUG] CUDA available: {torch.cuda.is_available()}")
+        if torch.cuda.is_available():
+            print(f"[DEBUG] CUDA device count: {torch.cuda.device_count()}")
+            print(f"[DEBUG] Current CUDA device: {torch.cuda.current_device()}")
+            print(f"[DEBUG] Device name: {torch.cuda.get_device_name(0)}")
+            print(f"[DEBUG] GPU Memory allocated: {torch.cuda.memory_allocated(0) / 1024**3:.2f} GB")
+            print(f"[DEBUG] GPU Memory reserved: {torch.cuda.memory_reserved(0) / 1024**3:.2f} GB")
 
         # Key: check and make sure the model is on the GPU
         model_device = next(model.parameters()).device
         print(f"[DEBUG] Model device: {model_device}")
+        print(f"[DEBUG] Model dtype: {next(model.parameters()).dtype}")
 
         if str(model_device) == 'cpu':
             print(f"[ERROR] Model is on CPU! Trying to move it to the GPU...")
-
-
+            if torch.cuda.is_available():
+                move_start = time.time()
+                model.cuda()
+                move_time = time.time() - move_start
+                print(f"[DEBUG] Model device after cuda(): {next(model.parameters()).device}")
+                print(f"[DEBUG] Moving the model to the GPU took: {move_time:.2f}s")
+            else:
+                print(f"[CRITICAL] CUDA is unavailable! Running on CPU, which will be very slow!")
+                print(f"[CRITICAL] This may be due to ZeroGPU resource pressure or a timeout")
+        else:
+            print(f"[INFO] Model is already on the GPU: {model_device}")
 
         messages = [messages]
 
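Taken together, the commit follows the standard ZeroGPU pattern: load on CPU at import time, then request a GPU per call. A minimal sketch of that pattern, assuming a generic `model` and `processor` already loaded on CPU (the exact processor call for HunYuanVL may differ):

import spaces
import torch

@spaces.GPU(duration=120)  # duration extends the per-call GPU budget past the default
def generate(prompt: str) -> str:
    # Inside the decorated call a GPU should be attached, so CUDA is visible here.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    inputs = processor(text=prompt, return_tensors="pt").to(device)
    output_ids = model.generate(**inputs, max_new_tokens=256)
    return processor.decode(output_ids[0], skip_special_tokens=True)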