Marcos Remar committed on
Commit
0f60c53
·
1 Parent(s): 84c36f0

Add working TTS test script to tests directory

Browse files
Files changed (1) hide show
  1. tests/final_tts_test.py +107 -0
tests/final_tts_test.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python3
"""Manual smoke test for CosyVoice zero-shot text-to-speech.

Loads a pretrained CosyVoice model, synthesizes one utterance each in
Portuguese, English and Chinese, exercises streaming synthesis, and finally
lists the ``output_*.wav`` files found in the current directory.

Every path used here is relative, so run the script from the repository
root::

    python tests/final_tts_test.py

The process exits with status 0 when every step succeeds and 1 otherwise.
"""
import sys

# Make the Matcha-TTS copy under third_party/ importable. The path is
# relative to the current working directory, not to this file.
sys.path.append('third_party/Matcha-TTS')

# Directory of the pretrained model to load.
MODEL_PATH = 'pretrained_models/CosyVoice-300M-direct'

# Reference recording used as the zero-shot voice prompt (loaded at 16 kHz).
PROMPT_WAV = 'asset/zero_shot_prompt.wav'

SEPARATOR = '=' * 50

# One entry per language: text to synthesize, the prompt text passed next to
# the prompt audio, and the file the result is written to.
TESTS = [
    {
        'lang': 'Portuguese',
        'text': 'Olá! Bem-vindo ao teste de síntese de voz do CosyVoice. Este sistema converte texto em fala de forma natural.',
        'prompt': 'Teste de voz em português.',
        'output': 'output_portuguese.wav',
    },
    {
        'lang': 'English',
        'text': 'Hello! This is a test of the CosyVoice text-to-speech system. It can generate natural sounding speech.',
        'prompt': 'Testing English speech synthesis.',
        'output': 'output_english.wav',
    },
    {
        'lang': 'Chinese',
        'text': '你好!这是CosyVoice语音合成系统的测试。它可以生成自然的语音。',
        'prompt': '测试中文语音合成。',
        'output': 'output_chinese.wav',
    },
]


def _run_batch_test(cosyvoice, prompt_speech_16k, test):
    """Synthesize ``test['text']`` without streaming and save it as one file.

    Args:
        cosyvoice: Loaded CosyVoice model.
        prompt_speech_16k: Prompt audio returned by ``load_wav``.
        test: One entry of ``TESTS``.

    Raises:
        RuntimeError: If the model yields no audio at all.
    """
    import os

    import torch
    import torchaudio

    print(f"\n{SEPARATOR}")
    print(f"Testing {test['lang']}:")
    print(f"Text: {test['text']}")
    print("Generating speech...")

    # The generator may yield more than one result for a single input.
    # Saving each result to the same path would keep only the last one, so
    # gather them all and join them along the sample axis (dim 1) first.
    segments = [
        result['tts_speech']
        for result in cosyvoice.inference_zero_shot(
            test['text'],
            test['prompt'],
            prompt_speech_16k,
            stream=False,
        )
    ]
    if not segments:
        raise RuntimeError(f"No audio was generated for {test['lang']}")

    speech = torch.cat(segments, dim=1)
    torchaudio.save(test['output'], speech, cosyvoice.sample_rate)

    duration = speech.shape[1] / cosyvoice.sample_rate
    size_kb = os.path.getsize(test['output']) / 1024

    print(f"✓ Generated: {test['output']}")
    print(f"  Duration: {duration:.2f} seconds")
    print(f"  Size: {size_kb:.1f} KB")
    print(f"  Sample rate: {cosyvoice.sample_rate} Hz")


def _run_streaming_test(cosyvoice, prompt_speech_16k):
    """Synthesize a Portuguese sentence in streaming mode and report chunks.

    Only the first chunk is written to disk, as an example; the others are
    measured and discarded.
    """
    import torchaudio

    print(f"\n{SEPARATOR}")
    print("Testing streaming mode:")
    text = "Este é um teste do modo streaming, que permite gerar áudio em tempo real com baixa latência."
    print(f"Text: {text}")

    chunk_count = 0  # stays 0 when the generator yields nothing
    total_duration = 0
    stream = cosyvoice.inference_zero_shot(
        text, "Modo streaming.", prompt_speech_16k, stream=True
    )
    for chunk_count, result in enumerate(stream, start=1):
        chunk_duration = result['tts_speech'].shape[1] / cosyvoice.sample_rate
        total_duration += chunk_duration
        print(f"  Chunk {chunk_count}: {chunk_duration:.3f}s")

        # Save first chunk as example
        if chunk_count == 1:
            torchaudio.save(
                'output_streaming_chunk1.wav',
                result['tts_speech'],
                cosyvoice.sample_rate,
            )

    print(f"✓ Generated {chunk_count} chunks, total duration: {total_duration:.2f}s")


def main():
    """Run every TTS check and return a process exit status.

    Returns:
        0 if all steps completed, 1 if any step raised (the error and its
        traceback are printed before returning).
    """
    print("=== CosyVoice TTS Final Test ===\n")

    try:
        # Imported here so that a missing or broken installation is reported
        # through the same error path as a runtime failure.
        from cosyvoice.cli.cosyvoice import CosyVoice
        from cosyvoice.utils.file_utils import load_wav
        import os

        print(f"Loading model from: {MODEL_PATH}")
        cosyvoice = CosyVoice(MODEL_PATH, load_jit=False, load_trt=False, fp16=False)
        print("✓ Model loaded successfully!")

        # Load prompt audio
        prompt_speech_16k = load_wav(PROMPT_WAV, 16000)
        print("✓ Prompt audio loaded")

        # Test different languages
        for test in TESTS:
            _run_batch_test(cosyvoice, prompt_speech_16k, test)

        # Test streaming mode
        _run_streaming_test(cosyvoice, prompt_speech_16k)

        print(f"\n{SEPARATOR}")
        print("✅ All tests completed successfully!")

        # Summary
        print("\n📁 Generated files:")
        for name in sorted(os.listdir('.')):
            if name.startswith('output_') and name.endswith('.wav'):
                size = os.path.getsize(name) / 1024
                print(f"  - {name} ({size:.1f} KB)")

        print("\n💡 To download files to your local machine:")
        # NOTE(review): this hint hard-codes one specific host, port and key
        # path; it is only valid on the machine it was written for. Consider
        # replacing it with placeholders before sharing the script.
        print("scp -P 40053 -i ~/.ssh/id_ed25519 root@213.192.2.74:~/CosyVoice/output_*.wav .")

    except Exception as e:
        print(f"\n❌ Error: {e}")
        import traceback
        traceback.print_exc()
        # Report the failure to the caller; otherwise the process would
        # still exit with status 0.
        return 1

    return 0


# The file name matches pytest's ``*_test.py`` collection pattern, so keep
# the heavy work out of import time and run it only when executed directly.
if __name__ == "__main__":
    sys.exit(main())