Spaces:

ACloudCenter
/

Conference-Generator-VibeVoice

Running on CPU Upgrade

App Files Community

ACloudCenter committed on Sep 4

Commit

0e9007a

1 Parent(s): 0a2959b

Modify with new transformer support

Browse files

Files changed (3) hide show

app.py +11 -27
requirements.txt +2 -10
setup.py +0 -89

app.py CHANGED Viewed

@@ -6,13 +6,10 @@ import librosa
 import soundfile as sf
 import torch
 import traceback
-import threading
 from spaces import GPU
 from datetime import datetime
-from modular.modeling_vibevoice_inference import VibeVoiceForConditionalGenerationInference
-from processor.vibevoice_processor import VibeVoiceProcessor
-from modular.streamer import AudioStreamer
 from transformers.utils import logging
 from transformers import set_seed
@@ -47,17 +44,6 @@ class VibeVoiceDemo:
     def load_models(self):
         print("Loading processors and models on CPU...")
-        # Debug: Show cache location
-        import os
-        cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
-        print(f"HuggingFace cache directory: {cache_dir}")
-        if os.path.exists(cache_dir):
-            print(f"Cache exists. Size: {sum(os.path.getsize(os.path.join(dirpath, filename)) for dirpath, _, filenames in os.walk(cache_dir) for filename in filenames) / (1024**3):.2f} GB")
-            print("Cached models:")
-            for item in os.listdir(cache_dir):
-                if item.startswith("models--"):
-                    print(f"  - {item}")
         for name, path in self.model_paths.items():
             print(f" - {name} from {path}")
             proc = VibeVoiceProcessor.from_pretrained(path)
@@ -173,15 +159,7 @@ class VibeVoiceDemo:
             log += f"Parameters: CFG Scale={cfg_scale}\n"
             log += f"Speakers: {', '.join(selected_speakers)}\n"
-            voice_samples = []
-            for speaker_name in selected_speakers:
-                audio_path = self.available_voices[speaker_name]
-                audio_data = self.read_audio(audio_path)
-                if len(audio_data) == 0:
-                    raise gr.Error(f"Error: Failed to load audio for {speaker_name}")
-                voice_samples.append(audio_data)
-            log += f"Loaded {len(voice_samples)} voice samples\n"
             lines = script.strip().split('\n')
             formatted_script_lines = []
@@ -199,13 +177,18 @@ class VibeVoiceDemo:
             log += f"Formatted script with {len(formatted_script_lines)} turns\n"
             log += "Processing with VibeVoice...\n"
             inputs = processor(
                 text=[formatted_script],
-                voice_samples=[voice_samples],
                 padding=True,
                 return_tensors="pt",
-                return_attention_mask=True,
             )
             start_time = time.time()
             outputs = model.generate(
@@ -227,7 +210,8 @@ class VibeVoiceDemo:
             if audio.ndim > 1:
                 audio = audio.squeeze()
-            sample_rate = 24000
             output_dir = "outputs"
             os.makedirs(output_dir, exist_ok=True)

 import soundfile as sf
 import torch
 import traceback
 from spaces import GPU
 from datetime import datetime
+from transformers import VibeVoiceForConditionalGenerationInference, VibeVoiceProcessor
 from transformers.utils import logging
 from transformers import set_seed
     def load_models(self):
         print("Loading processors and models on CPU...")
         for name, path in self.model_paths.items():
             print(f" - {name} from {path}")
             proc = VibeVoiceProcessor.from_pretrained(path)
             log += f"Parameters: CFG Scale={cfg_scale}\n"
             log += f"Speakers: {', '.join(selected_speakers)}\n"
+            log += f"Using voice samples from selected speakers\n"
             lines = script.strip().split('\n')
             formatted_script_lines = []
             log += f"Formatted script with {len(formatted_script_lines)} turns\n"
             log += "Processing with VibeVoice...\n"
+            # Processor now expects file paths, not audio arrays
+            voice_sample_paths = [self.available_voices[speaker] for speaker in selected_speakers]
             inputs = processor(
                 text=[formatted_script],
+                voice_samples=[voice_sample_paths],
                 padding=True,
                 return_tensors="pt",
             )
+            # Move inputs to device
+            inputs = {key: val.to(self.device) if isinstance(val, torch.Tensor) else val for key, val in inputs.items()}
             start_time = time.time()
             outputs = model.generate(
             if audio.ndim > 1:
                 audio = audio.squeeze()
+            # Get sample rate from processor
+            sample_rate = processor.audio_processor.sampling_rate if hasattr(processor, 'audio_processor') else 24000
             output_dir = "outputs"
             os.makedirs(output_dir, exist_ok=True)

requirements.txt CHANGED Viewed

@@ -1,19 +1,11 @@
 spaces
 torch
-accelerate==1.6.0
-transformers==4.51.3
-diffusers
 tqdm
 numpy
 scipy
-ml-collections
-absl-py
 gradio
-av
-aiortc
 soundfile
 librosa
-pydub
-requests
-python-dotenv

 spaces
 torch
+accelerate
+git+https://github.com/huggingface/transformers.git@refs/pull/40546/head
 tqdm
 numpy
 scipy
 gradio
 soundfile
 librosa

setup.py DELETED Viewed

@@ -1,89 +0,0 @@
-import os
-import subprocess
-import sys
-import shutil
-from pathlib import Path
-# setup.py will clone and install VibeVoice, copy voice files if they exist
-def setup_vibevoice():
-    repo_dir = "VibeVoice"
-    original_dir = os.getcwd()
-    # clone repo if needed
-    if not os.path.exists(repo_dir):
-        print("Cloning the VibeVoice repository...")
-        try:
-            subprocess.run(
-                ["git", "clone", "https://github.com/vibevoice-community/VibeVoice"],
-                check=True,
-                capture_output=True,
-                text=True
-            )
-            print("Repository cloned successfully.")
-        except subprocess.CalledProcessError as e:
-            print(f"Error cloning repository: {e.stderr}")
-            sys.exit(1)
-    else:
-        print("Repository already exists. Skipping clone.")
-    # install the package
-    os.chdir(repo_dir)
-    print(f"Changed directory to: {os.getcwd()}")
-    print("Installing the VibeVoice package...")
-    try:
-        subprocess.run(
-            [sys.executable, "-m", "pip", "install", "-e", "."],
-            check=True,
-            capture_output=True,
-            text=True
-        )
-        print("Package installed successfully.")
-    except subprocess.CalledProcessError as e:
-        print(f"Error installing package: {e.stderr}")
-        sys.exit(1)
-    # add to python path
-    sys.path.insert(0, os.getcwd())
-    # go back to original directory
-    os.chdir(original_dir)
-    print(f"Changed back to original directory: {os.getcwd()}")
-    # copy voice files if they exist
-    if os.path.exists("public/voices"):
-        target_voices_dir = os.path.join(repo_dir, "demo", "voices")
-        # clear existing voices and use only ours
-        if os.path.exists(target_voices_dir):
-            shutil.rmtree(target_voices_dir)
-        os.makedirs(target_voices_dir)
-        for file in os.listdir("public/voices"):
-            if file.endswith(('.mp3', '.wav', '.flac', '.ogg', '.m4a', '.aac')):
-                src = os.path.join("public/voices", file)
-                dst = os.path.join(target_voices_dir, file)
-                shutil.copy2(src, dst)
-                print(f"Copied voice file: {file}")
-    return repo_dir
-def setup_voice_presets():
-    # get voice files from vibevoice demo directory
-    voices_dir = Path("VibeVoice/demo/voices")
-    voice_presets = {}
-    audio_extensions = ('.wav', '.mp3', '.flac', '.ogg', '.m4a', '.aac')
-    if voices_dir.exists():
-        for audio_file in voices_dir.glob("*"):
-            if audio_file.suffix.lower() in audio_extensions:
-                name = audio_file.stem
-                voice_presets[name] = str(audio_file)
-    # if no voices found, create directory
-    if not voice_presets and not voices_dir.exists():
-        voices_dir.mkdir(parents=True, exist_ok=True)
-    return dict(sorted(voice_presets.items()))