ACloudCenter committed
Commit 601dafe · 1 Parent(s): a08f8f8

Implement faster inference times - test

Files changed (1)
  1. app.py +44 -11
app.py CHANGED
@@ -9,6 +9,7 @@ import traceback
 import threading
 from spaces import GPU
 from datetime import datetime
+from contextlib import contextmanager
 
 from modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
 from processor.vibevoice_processor import VibeVoiceProcessor
@@ -39,6 +40,14 @@ class VibeVoiceDemo:
         self.current_model_name = None
 
         self.available_voices = {}
+
+        # Set compiler flags for better performance
+        if torch.cuda.is_available() and hasattr(torch, '_inductor'):
+            if hasattr(torch._inductor, 'config'):
+                torch._inductor.config.conv_1x1_as_mm = True
+                torch._inductor.config.coordinate_descent_tuning = True
+                torch._inductor.config.epilogue_fusion = False
+                torch._inductor.config.coordinate_descent_check_all_directions = True
 
         self.load_models()  # load all on CPU
         self.setup_voice_presets()
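
The `torch._inductor.config` flags above tune the kernels that `torch.compile` (whose default backend is inductor) generates; this diff does not itself call `torch.compile`, so they only take effect if the model is compiled elsewhere. A minimal sketch of the pairing, with an `nn.Linear` stand-in for the real model (hypothetical, not part of this commit):

import torch
import torch.nn as nn

# Same inductor flags as set in __init__ above; they are read at compile time.
torch._inductor.config.conv_1x1_as_mm = True
torch._inductor.config.coordinate_descent_tuning = True

model = nn.Linear(16, 16)  # hypothetical stand-in for the VibeVoice model
if torch.cuda.is_available():
    model = model.cuda()
    # "max-autotune" benchmarks candidate kernels, which is where
    # coordinate-descent tuning does its work.
    compiled = torch.compile(model, mode="max-autotune")
    out = compiled(torch.randn(4, 16, device="cuda"))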
@@ -61,9 +70,19 @@ class VibeVoiceDemo:
         for name, path in self.model_paths.items():
             print(f" - {name} from {path}")
             proc = VibeVoiceProcessor.from_pretrained(path)
-            mdl = VibeVoiceForConditionalGenerationInference.from_pretrained(
-                path, torch_dtype=torch.bfloat16
-            )
+            # Try to use flash attention if available
+            try:
+                mdl = VibeVoiceForConditionalGenerationInference.from_pretrained(
+                    path,
+                    torch_dtype=torch.bfloat16,
+                    attn_implementation="flash_attention_2"
+                )
+                print(f" Flash Attention 2 enabled for {name}")
+            except Exception:
+                # Fallback to default attention
+                mdl = VibeVoiceForConditionalGenerationInference.from_pretrained(
+                    path, torch_dtype=torch.bfloat16
+                )
             # Keep on CPU initially
             self.processors[name] = proc
             self.models[name] = mdl
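
The try/except above falls back to default attention on any failure. Assuming the model accepts the standard transformers `attn_implementation` values, a narrower probe can decide up front; `pick_attn_implementation` is a hypothetical helper, not part of this commit:

import importlib.util

import torch

def pick_attn_implementation() -> str:
    # flash_attention_2 needs the separate flash-attn package and a CUDA device;
    # "sdpa" (PyTorch's scaled-dot-product attention) is a widely available fallback.
    if torch.cuda.is_available() and importlib.util.find_spec("flash_attn") is not None:
        return "flash_attention_2"
    return "sdpa"

The result would then be passed as `attn_implementation=pick_attn_implementation()` in the `from_pretrained` call above.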
@@ -208,14 +227,28 @@ class VibeVoiceDemo:
         )
 
         start_time = time.time()
-        outputs = model.generate(
-            **inputs,
-            max_new_tokens=None,
-            cfg_scale=cfg_scale,
-            tokenizer=processor.tokenizer,
-            generation_config={'do_sample': False},
-            verbose=False,
-        )
+
+        # Use efficient attention backend
+        if torch.cuda.is_available() and hasattr(torch.nn, 'attention') and hasattr(torch.nn.attention, 'SDPBackend'):
+            from torch.nn.attention import SDPBackend, sdpa_kernel
+            with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
+                outputs = model.generate(
+                    **inputs,
+                    max_new_tokens=None,
+                    cfg_scale=cfg_scale,
+                    tokenizer=processor.tokenizer,
+                    generation_config={'do_sample': False},
+                    verbose=False,
+                )
+        else:
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=None,
+                cfg_scale=cfg_scale,
+                tokenizer=processor.tokenizer,
+                generation_config={'do_sample': False},
+                verbose=False,
+            )
         generation_time = time.time() - start_time
 
         if hasattr(outputs, 'speech_outputs') and outputs.speech_outputs[0] is not None:
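
`sdpa_kernel` is a context manager that restricts which fused kernels `torch.nn.functional.scaled_dot_product_attention` may dispatch to inside the block, so the pin above only matters where the model's attention layers route through that function. A self-contained sketch (PyTorch >= 2.3 for `torch.nn.attention`, CUDA device and arbitrary shapes assumed):

import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

# (batch, heads, sequence, head_dim) -- illustrative shapes only
q = k = v = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.bfloat16)

# Only the memory-efficient kernel may be used inside the block;
# a list of SDPBackend values can be passed to allow several candidates.
with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
    out = F.scaled_dot_product_attention(q, k, v)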
 