Commit 601dafe (1 parent: a08f8f8)
Implement faster inference times - test

app.py CHANGED
@@ -9,6 +9,7 @@ import traceback
 import threading
 from spaces import GPU
 from datetime import datetime
+from contextlib import contextmanager
 
 from modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
 from processor.vibevoice_processor import VibeVoiceProcessor
@@ -39,6 +40,14 @@ class VibeVoiceDemo:
         self.current_model_name = None
 
         self.available_voices = {}
+
+        # Set compiler flags for better performance
+        if torch.cuda.is_available() and hasattr(torch, '_inductor'):
+            if hasattr(torch._inductor, 'config'):
+                torch._inductor.config.conv_1x1_as_mm = True
+                torch._inductor.config.coordinate_descent_tuning = True
+                torch._inductor.config.epilogue_fusion = False
+                torch._inductor.config.coordinate_descent_check_all_directions = True
 
         self.load_models()  # load all on CPU
         self.setup_voice_presets()
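Note on the hunk above: these flags configure TorchInductor, the torch.compile code-generation backend, and have no effect on eager-mode execution. None of the hunks in this commit call torch.compile, so the settings only pay off if the model is compiled elsewhere in the Space. A minimal sketch of how the flags pair with compilation; the compile site and the mode argument are assumptions, not part of this commit:

import torch

# TorchInductor knobs set in this commit; they are consulted when
# torch.compile lowers the model, not during eager execution.
torch._inductor.config.conv_1x1_as_mm = True                        # lower 1x1 convs to matmuls
torch._inductor.config.coordinate_descent_tuning = True             # broader kernel autotuning
torch._inductor.config.epilogue_fusion = False                      # disable epilogue fusion
torch._inductor.config.coordinate_descent_check_all_directions = True

# Hypothetical compile site -- the commit itself never calls torch.compile.
net = torch.nn.Sequential(torch.nn.Conv2d(8, 8, kernel_size=1), torch.nn.ReLU())
compiled = torch.compile(net, mode="reduce-overhead")
out = compiled(torch.randn(1, 8, 16, 16))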
@@ -61,9 +70,19 @@ class VibeVoiceDemo:
         for name, path in self.model_paths.items():
             print(f"  - {name} from {path}")
             proc = VibeVoiceProcessor.from_pretrained(path)
-            mdl = VibeVoiceForConditionalGenerationInference.from_pretrained(
-                path, torch_dtype=torch.bfloat16
-            )
+            # Try to use flash attention if available
+            try:
+                mdl = VibeVoiceForConditionalGenerationInference.from_pretrained(
+                    path,
+                    torch_dtype=torch.bfloat16,
+                    attn_implementation="flash_attention_2"
+                )
+                print(f"    Flash Attention 2 enabled for {name}")
+            except:
+                # Fallback to default attention
+                mdl = VibeVoiceForConditionalGenerationInference.from_pretrained(
+                    path, torch_dtype=torch.bfloat16
+                )
             # Keep on CPU initially
             self.processors[name] = proc
             self.models[name] = mdl
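Two caveats on the loader hunk above: attn_implementation="flash_attention_2" requires the separate flash-attn package plus a supported CUDA GPU, and the bare except: catches every exception, so a genuine load failure (for example, a bad checkpoint path) would be silently retried as an attention fallback. A sketch of a narrower capability probe, assuming transformers' standard "sdpa" fallback value; pick_attn_implementation is a hypothetical helper, not part of the commit:

import importlib.util

import torch


def pick_attn_implementation() -> str:
    # Flash Attention 2 needs the flash-attn package and a CUDA device;
    # otherwise fall back to PyTorch's scaled-dot-product attention.
    if torch.cuda.is_available() and importlib.util.find_spec("flash_attn") is not None:
        return "flash_attention_2"
    return "sdpa"


# Usage with the loader from this commit (path as in the surrounding loop):
# mdl = VibeVoiceForConditionalGenerationInference.from_pretrained(
#     path,
#     torch_dtype=torch.bfloat16,
#     attn_implementation=pick_attn_implementation(),
# )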
@@ -208,14 +227,28 @@ class VibeVoiceDemo:
         )
 
         start_time = time.time()
-        outputs = model.generate(
-            **inputs,
-            max_new_tokens=None,
-            cfg_scale=cfg_scale,
-            tokenizer=processor.tokenizer,
-            generation_config={'do_sample': False},
-            verbose=False,
-        )
+
+        # Use efficient attention backend
+        if torch.cuda.is_available() and hasattr(torch.nn.attention, 'SDPBackend'):
+            from torch.nn.attention import SDPBackend, sdpa_kernel
+            with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
+                outputs = model.generate(
+                    **inputs,
+                    max_new_tokens=None,
+                    cfg_scale=cfg_scale,
+                    tokenizer=processor.tokenizer,
+                    generation_config={'do_sample': False},
+                    verbose=False,
+                )
+        else:
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=None,
+                cfg_scale=cfg_scale,
+                tokenizer=processor.tokenizer,
+                generation_config={'do_sample': False},
+                verbose=False,
+            )
         generation_time = time.time() - start_time
 
         if hasattr(outputs, 'speech_outputs') and outputs.speech_outputs[0] is not None:
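On the generation hunk above: sdpa_kernel (PyTorch 2.3+) restricts which backend torch.nn.functional.scaled_dot_product_attention may dispatch to inside the context, so it only affects attention that actually routes through SDPA; a model loaded with attn_implementation="flash_attention_2" earlier in this commit likely bypasses it. Also, on PyTorch builds without the torch.nn.attention module, evaluating hasattr(torch.nn.attention, 'SDPBackend') raises AttributeError instead of returning False. A minimal standalone sketch of the context manager; shapes, dtype, and device are arbitrary assumptions:

import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

# (batch, heads, sequence, head_dim) -- arbitrary demo shapes
q = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.bfloat16)
k = torch.randn_like(q)
v = torch.randn_like(q)

# Pin scaled_dot_product_attention to the memory-efficient kernel for
# everything executed inside this block, instead of letting PyTorch
# choose among the flash / memory-efficient / math backends.
with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
    out = F.scaled_dot_product_attention(q, k, v)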