Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Commit
·
63cd7b1
1
Parent(s):
0a306a9
Modify layout
Browse files
- app.py +56 -18
- public/images/{diagram.png → chart.png} +2 -2
- public/images/diagram.jpg +3 -0
app.py
CHANGED
|
@@ -451,36 +451,74 @@ def create_demo_interface(demo_instance: VibeVoiceDemo):
|
|
| 451 |
with gr.Tab("Architecture"):
|
| 452 |
with gr.Row():
|
| 453 |
with gr.Column():
|
| 454 |
-
gr.Markdown("## VibeVoice
|
| 455 |
|
| 456 |
gr.Markdown("""
|
| 457 |
-
###
|
| 458 |
-
|
| 459 |
-
VibeVoice
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 460 |
|
| 461 |
-
|
| 462 |
-
-
|
| 463 |
-
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 464 |
|
| 465 |
-
|
| 466 |
|
| 467 |
-
|
| 468 |
|
| 469 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 470 |
|
| 471 |
### Technical Specifications
|
| 472 |
-
- **
|
| 473 |
-
- **
|
| 474 |
-
- **
|
| 475 |
-
- **
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 476 |
""")
|
| 477 |
|
| 478 |
with gr.Column(scale=2):
|
| 479 |
gr.HTML("""
|
| 480 |
-
<div style="text-align: center;
|
| 481 |
-
<
|
| 482 |
-
|
| 483 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 484 |
</div>
|
| 485 |
""")
|
| 486 |
|
|
|
|
| 451 |
with gr.Tab("Architecture"):
|
| 452 |
with gr.Row():
|
| 453 |
with gr.Column():
|
| 454 |
+
gr.Markdown("## VibeVoice: A Frontier Open-Source Text-to-Speech Model")
|
| 455 |
|
| 456 |
gr.Markdown("""
|
| 457 |
+
### Overview
|
| 458 |
+
|
| 459 |
+
VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio,
|
| 460 |
+
such as podcasts, from text. It addresses significant challenges in traditional Text-to-Speech (TTS) systems,
|
| 461 |
+
particularly in scalability, speaker consistency, and natural turn-taking.
|
| 462 |
+
|
| 463 |
+
### Training Architecture
|
| 464 |
+
|
| 465 |
+
**Transformer-based Large Language Model** integrated with specialized acoustic and semantic tokenizers and a diffusion-based decoding head.
|
| 466 |
|
| 467 |
+
**Core Components:**
|
| 468 |
+
- **LLM**: Qwen2.5-1.5B for this release
|
| 469 |
+
- **Acoustic Tokenizer**: Based on a σ-VAE variant with mirror-symmetric encoder-decoder structure (~340M parameters each)
|
| 470 |
+
- 7 stages of modified Transformer blocks
|
| 471 |
+
- Achieves 3200x downsampling from 24kHz input
|
| 472 |
+
- **Semantic Tokenizer**: Encoder mirrors the Acoustic Tokenizer's architecture
|
| 473 |
+
- Trained with an ASR proxy task
|
| 474 |
+
- **Diffusion Head**: Lightweight module (4 layers, ~123M parameters)
|
| 475 |
+
- Conditioned on LLM hidden states
|
| 476 |
+
- Uses DDPM process with Classifier-Free Guidance
|
| 477 |
|
| 478 |
+
### Training Details
|
| 479 |
|
| 480 |
+
**Context Length**: Trained with curriculum up to 65,536 tokens
|
| 481 |
|
| 482 |
+
**Training Stages:**
|
| 483 |
+
1. **Tokenizer Pre-training**: Acoustic and Semantic tokenizers trained separately
|
| 484 |
+
2. **VibeVoice Training**: Frozen tokenizers, only LLM and diffusion head trained
|
| 485 |
+
- Curriculum learning: 4K → 16K → 32K → 64K tokens
|
| 486 |
+
|
| 487 |
+
### Model Variants
|
| 488 |
+
|
| 489 |
+
| Model | Context Length | Generation Length | Parameters |
|
| 490 |
+
|-------|---------------|-------------------|------------|
|
| 491 |
+
| VibeVoice-0.5B-Streaming | - | - | Coming Soon |
|
| 492 |
+
| **VibeVoice-1.5B** | 64K | ~90 min | 2.7B |
|
| 493 |
+
| VibeVoice-Large | 32K | ~45 min | Available |
|
| 494 |
|
| 495 |
### Technical Specifications
|
| 496 |
+
- **Frame Rate**: Ultra-low 7.5 Hz for efficiency
|
| 497 |
+
- **Sample Rate**: 24kHz audio output
|
| 498 |
+
- **Max Duration**: Up to 90 minutes
|
| 499 |
+
- **Speaker Capacity**: 1-4 distinct speakers
|
| 500 |
+
- **Languages**: English and Chinese
|
| 501 |
+
|
| 502 |
+
### Key Innovations
|
| 503 |
+
- Continuous speech tokenizers at ultra-low frame rate
|
| 504 |
+
- Next-token diffusion framework
|
| 505 |
+
- Curriculum learning for long-form generation
|
| 506 |
+
- Multi-speaker consistency without explicit modeling
|
| 507 |
""")
|
| 508 |
|
| 509 |
with gr.Column(scale=2):
|
| 510 |
gr.HTML("""
|
| 511 |
+
<div style="text-align: center;">
|
| 512 |
+
<div style="margin: 20px 0;">
|
| 513 |
+
<img src="https://huggingface.co/spaces/ACloudCenter/Conference-Generator-VibeVoice/resolve/main/public/images/diagram.png"
|
| 514 |
+
style="max-width: 100%; height: auto; border-radius: 10px; box-shadow: 0 4px 6px rgba(0,0,0,0.1);"
|
| 515 |
+
alt="VibeVoice Architecture Diagram">
|
| 516 |
+
</div>
|
| 517 |
+
<div style="margin: 20px 0;">
|
| 518 |
+
<img src="https://huggingface.co/spaces/ACloudCenter/Conference-Generator-VibeVoice/resolve/main/public/images/chart.png"
|
| 519 |
+
style="max-width: 100%; height: auto; border-radius: 10px; box-shadow: 0 4px 6px rgba(0,0,0,0.1);"
|
| 520 |
+
alt="VibeVoice Performance Chart">
|
| 521 |
+
</div>
|
| 522 |
</div>
|
| 523 |
""")
|
| 524 |
|
public/images/{diagram.png → chart.png}
RENAMED
|
File without changes
|
public/images/diagram.jpg
ADDED
|
Git LFS Details
|