ACloudCenter committed
Commit 93a624e · 1 Parent(s): 34aa391

Revert from transformer version

Files changed (2)
  1. app.py +29 -13
  2. requirements.txt +9 -2
app.py CHANGED

@@ -6,10 +6,13 @@ import librosa
 import soundfile as sf
 import torch
 import traceback
+import threading
 from spaces import GPU
 from datetime import datetime
 
-from transformers import VibeVoiceForConditionalGenerationInference, VibeVoiceProcessor
+from modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
+from processor.vibevoice_processor import VibeVoiceProcessor
+from modular.streamer import AudioStreamer
 from transformers.utils import logging
 from transformers import set_seed
 
@@ -44,6 +47,17 @@ class VibeVoiceDemo:
     def load_models(self):
         print("Loading processors and models on CPU...")
 
+        # Debug: Show cache location
+        import os
+        cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
+        print(f"HuggingFace cache directory: {cache_dir}")
+        if os.path.exists(cache_dir):
+            print(f"Cache exists. Size: {sum(os.path.getsize(os.path.join(dirpath, filename)) for dirpath, _, filenames in os.walk(cache_dir) for filename in filenames) / (1024**3):.2f} GB")
+            print("Cached models:")
+            for item in os.listdir(cache_dir):
+                if item.startswith("models--"):
+                    print(f" - {item}")
+
         for name, path in self.model_paths.items():
             print(f" - {name} from {path}")
             proc = VibeVoiceProcessor.from_pretrained(path)
@@ -159,7 +173,15 @@ class VibeVoiceDemo:
         log += f"Parameters: CFG Scale={cfg_scale}\n"
         log += f"Speakers: {', '.join(selected_speakers)}\n"
 
-        log += f"Using voice samples from selected speakers\n"
+        voice_samples = []
+        for speaker_name in selected_speakers:
+            audio_path = self.available_voices[speaker_name]
+            audio_data = self.read_audio(audio_path)
+            if len(audio_data) == 0:
+                raise gr.Error(f"Error: Failed to load audio for {speaker_name}")
+            voice_samples.append(audio_data)
+
+        log += f"Loaded {len(voice_samples)} voice samples\n"
 
         lines = script.strip().split('\n')
         formatted_script_lines = []
@@ -177,18 +199,13 @@ class VibeVoiceDemo:
         log += f"Formatted script with {len(formatted_script_lines)} turns\n"
         log += "Processing with VibeVoice...\n"
 
-        # Processor now expects file paths, not audio arrays
-        voice_sample_paths = [self.available_voices[speaker] for speaker in selected_speakers]
-
         inputs = processor(
             text=[formatted_script],
-            voice_samples=[voice_sample_paths],
+            voice_samples=[voice_samples],
             padding=True,
             return_tensors="pt",
+            return_attention_mask=True,
         )
-
-        # Move inputs to device
-        inputs = {key: val.to(self.device) if isinstance(val, torch.Tensor) else val for key, val in inputs.items()}
 
         start_time = time.time()
         outputs = model.generate(
@@ -196,7 +213,7 @@
             max_new_tokens=None,
             cfg_scale=cfg_scale,
             tokenizer=processor.tokenizer,
-            generation_config={'do_sample': False, 'use_cache': False},
+            generation_config={'do_sample': False},
             verbose=False,
         )
         generation_time = time.time() - start_time
@@ -210,8 +227,7 @@
         if audio.ndim > 1:
             audio = audio.squeeze()
 
-        # Get sample rate from processor
-        sample_rate = processor.audio_processor.sampling_rate if hasattr(processor, 'audio_processor') else 24000
+        sample_rate = 24000
 
         output_dir = "outputs"
         os.makedirs(output_dir, exist_ok=True)
@@ -714,7 +730,7 @@ def run_demo(
     model_paths: dict = None,
     device: str = "cuda",
     inference_steps: int = 5,
-    share: bool = True,
+    share: bool = False,
 ):
     """
     model_paths default includes two entries. Replace paths as needed.
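
For context on what the revert changes in app.py: the local-module code path feeds the processor decoded audio arrays rather than file paths, keeps the KV cache enabled during generation, and fixes the output rate at 24 kHz. The snippet below is a minimal sketch of that call path assembled from the hunks above; it is not part of the commit. The checkpoint path, voice file names, cfg_scale value, the librosa-based read_audio helper, and unpacking inputs with **inputs are all assumptions for illustration.

# Sketch of the reverted call path (assumptions noted above); not part of app.py.
import librosa
import numpy as np

from modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
from processor.vibevoice_processor import VibeVoiceProcessor


def read_audio(path: str, target_sr: int = 24000) -> np.ndarray:
    # Hypothetical helper standing in for VibeVoiceDemo.read_audio:
    # load a reference clip as a mono float array at the model's 24 kHz rate.
    audio, _ = librosa.load(path, sr=target_sr, mono=True)
    return audio


model_path = "path/to/vibevoice-checkpoint"  # assumption: replace with a real checkpoint
processor = VibeVoiceProcessor.from_pretrained(model_path)
model = VibeVoiceForConditionalGenerationInference.from_pretrained(model_path)

# Voice samples are passed as arrays (one list per batch item), not as file paths.
voice_samples = [read_audio(p) for p in ["voices/alice.wav", "voices/bob.wav"]]  # hypothetical files
inputs = processor(
    text=["Speaker 1: Hello there.\nSpeaker 2: Hi, welcome back."],
    voice_samples=[voice_samples],
    padding=True,
    return_tensors="pt",
    return_attention_mask=True,
)

outputs = model.generate(
    **inputs,                                # assumption: inputs unpacked as keyword args
    max_new_tokens=None,
    cfg_scale=1.3,                           # assumption: the app passes its slider value
    tokenizer=processor.tokenizer,
    generation_config={'do_sample': False},  # cache stays enabled after the revert
    verbose=False,
)
sample_rate = 24000                          # fixed output rate, as in the diff
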
requirements.txt CHANGED

@@ -1,12 +1,19 @@
 spaces
 torch
-accelerate
-git+https://github.com/huggingface/transformers.git@refs/pull/40546/head
+accelerate==1.6.0
+transformers==4.51.3
 diffusers
 tqdm
 numpy
 scipy
+ml-collections
+absl-py
 gradio
+av
+aiortc
 soundfile
 librosa
+pydub
+requests
+python-dotenv
 
19