aleclyu committed
Commit bdbf47f · 1 Parent(s): e7257d2

debug zerogpu timeout error

Files changed (1):
  1. app.py +68 -8
app.py CHANGED
@@ -15,6 +15,13 @@ import tempfile
 import hashlib
 import gc
 
+# Key optimization: environment variables that speed up transformers
+os.environ['TOKENIZERS_PARALLELISM'] = 'false'  # avoid tokenizer warnings
+os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = '1'
+# Disable PyTorch's JIT profiling/fusion optimizations (they can make the first run extremely slow in some cases)
+# torch._C._jit_set_profiling_executor(False)
+# torch._C._jit_set_profiling_mode(False)
+
 
 
 
@@ -60,6 +67,16 @@ def _load_model_processor(args):
         device_map="auto",  # back to "auto" so ZeroGPU manages device placement
         token=os.environ.get('HF_TOKEN')
     )
+
+    # Key: disable gradient checkpointing (inference is extremely slow if it is left enabled)
+    if hasattr(model, 'gradient_checkpointing_disable'):
+        model.gradient_checkpointing_disable()
+        print(f"[INFO] gradient checkpointing disabled")
+
+    # Switch the model to evaluation mode
+    model.eval()
+    print(f"[INFO] model set to eval mode")
+
     processor = AutoProcessor.from_pretrained(args.checkpoint_path, use_fast=False, trust_remote_code=True)
 
     print(f"[INFO] model loading finished, current device: {next(model.parameters()).device}")
@@ -93,13 +110,26 @@ def _gc():
 
 
 def _launch_demo(args, model, processor):
+    # Mutable flag used to track whether this is the first call
+    first_call = [True]
+
     # Key fix: drop the model and processor parameters and access them via closure
     # Raise duration to 120 seconds to avoid timeouts at peak times
     @spaces.GPU(duration=120)
     def call_local_model(messages):
         import time
+        import sys
         start_time = time.time()
+
+        if first_call[0]:
+            print(f"[INFO] ========== first inference call ==========")
+            first_call[0] = False
+        else:
+            print(f"[INFO] ========== subsequent (Nth) inference call ==========")
+
         print(f"[DEBUG] ========== starting inference ==========")
+        print(f"[DEBUG] Python version: {sys.version}")
+        print(f"[DEBUG] PyTorch version: {torch.__version__}")
         print(f"[DEBUG] CUDA available: {torch.cuda.is_available()}")
         if torch.cuda.is_available():
             print(f"[DEBUG] CUDA device count: {torch.cuda.device_count()}")
@@ -139,6 +169,17 @@ def _launch_demo(args, model, processor):
         image_inputs, video_inputs = process_vision_info(messages)
         print(f"[DEBUG] image processing finished, elapsed: {time.time() - start_time:.2f}s")
 
+        # Inspect the size of each image input
+        if image_inputs:
+            for idx, img in enumerate(image_inputs):
+                if hasattr(img, 'size'):
+                    print(f"[DEBUG] Image {idx} size: {img.size}")
+                elif isinstance(img, np.ndarray):
+                    print(f"[DEBUG] Image {idx} shape: {img.shape}")
+
+        print(f"[DEBUG] encoding inputs with the processor...")
+        processor_start = time.time()
+
         inputs = processor(
             text=texts,
             images=image_inputs,
@@ -146,9 +187,13 @@
             padding=True,
             return_tensors="pt",
         )
+        print(f"[DEBUG] processor encoding finished, elapsed: {time.time() - processor_start:.2f}s")
+
         # Make sure the inputs are on the GPU
+        to_device_start = time.time()
         inputs = inputs.to('cuda' if torch.cuda.is_available() else 'cpu')
-        print(f"[DEBUG] input preparation finished, elapsed: {time.time() - start_time:.2f}s")
+        print(f"[DEBUG] moving inputs to device took: {time.time() - to_device_start:.2f}s")
+        print(f"[DEBUG] input preparation finished, total elapsed: {time.time() - start_time:.2f}s")
         print(f"[DEBUG] Input IDs shape: {inputs.input_ids.shape}")
         print(f"[DEBUG] Input device: {inputs.input_ids.device}")
         print(f"[DEBUG] Input sequence length: {inputs.input_ids.shape[1]}")
@@ -177,22 +222,37 @@ def _launch_demo(args, model, processor):
             return False
 
         with torch.no_grad():
-            print(f"[DEBUG] calling model.generate()...")
+            print(f"[DEBUG] entered torch.no_grad() context, elapsed: {time.time() - start_time:.2f}s")
+
+            # Run a simple forward-pass test first
+            print(f"[DEBUG] testing the forward pass...")
+            forward_test_start = time.time()
+            try:
+                with torch.cuda.amp.autocast(dtype=torch.bfloat16):
+                    test_outputs = model(**inputs, use_cache=False)
+                print(f"[DEBUG] forward-pass test succeeded, elapsed: {time.time() - forward_test_start:.2f}s")
+            except Exception as e:
+                print(f"[WARNING] forward-pass test failed: {e}")
+
+            print(f"[DEBUG] calling model.generate()... (elapsed so far: {time.time() - start_time:.2f}s)")
+            generate_call_start = time.time()
+
             try:
+                # Key: more aggressive generation parameters to force early stopping
                 generated_ids = model.generate(
                     **inputs,
-                    max_new_tokens=max_new_tokens,
+                    max_new_tokens=1024,
                     repetition_penalty=1.03,
-                    do_sample=False,
-                    stopping_criteria=None,  # make sure there are no extra stopping criteria
-                    pad_token_id=processor.tokenizer.pad_token_id,
-                    eos_token_id=processor.tokenizer.eos_token_id,
+                    do_sample=False
                 )
+                print(f"[DEBUG] model.generate() returned, elapsed: {time.time() - generate_call_start:.2f}s")
             except Exception as e:
                 print(f"[ERROR] generation failed: {e}")
+                import traceback
+                traceback.print_exc()
                 raise
 
-            print(f"[DEBUG] model.generate() call finished")
+            print(f"[DEBUG] exited torch.no_grad() context")
 
             gen_time = time.time() - gen_start
             print(f"[DEBUG] ========== generation finished ==========")
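Note on the pattern this commit leans on: the model is loaded once at startup and the @spaces.GPU-decorated function reaches it through a closure instead of taking it as a parameter (the "key fix" comment in the third hunk). Below is a minimal sketch of that pattern, assuming the Hugging Face `spaces` package; the checkpoint id and the `generate` helper are illustrative, not this app's actual code.

import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load once, outside the GPU-decorated function (this runs on CPU at startup).
model = AutoModelForCausalLM.from_pretrained(
    "some/checkpoint",  # hypothetical model id, stand-in for args.checkpoint_path
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained("some/checkpoint")

@spaces.GPU(duration=120)  # reserve a ZeroGPU worker for up to 120 s per call
def generate(prompt: str) -> str:
    # model/tokenizer are captured from the enclosing scope rather than passed
    # as arguments, so each GPU call does not have to ship them along, which
    # is presumably why the commit removes them from the function signature.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=64, do_sample=False)
    return tokenizer.decode(out[0], skip_special_tokens=True)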
 
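One caveat in the warm-up block of the last hunk: `torch.cuda.amp.autocast(dtype=...)` is deprecated in recent PyTorch releases in favor of `torch.amp.autocast` with an explicit device type. A sketch of the same warm-up under the newer API, assuming PyTorch >= 2.0, with `model` and `inputs` as in the diff above and `warmup_forward` a hypothetical helper name:

import time

import torch

def warmup_forward(model, inputs):
    # One throwaway forward pass to surface CUDA/kernel initialization cost
    # before timing model.generate(); the outputs are discarded.
    start = time.time()
    try:
        with torch.no_grad(), torch.amp.autocast("cuda", dtype=torch.bfloat16):
            model(**inputs, use_cache=False)
        print(f"[DEBUG] warm-up forward pass took {time.time() - start:.2f}s")
    except Exception as e:
        print(f"[WARNING] warm-up forward pass failed: {e}")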