lihongjie committed on
Commit 4ae14db · 1 Parent(s): 677b20d
.gitattributes CHANGED
@@ -88,3 +88,13 @@ main_ax650 filter=lfs diff=lfs merge=lfs -text
88
  token2wav-axmodels/flow_estimator_200.axmodel filter=lfs diff=lfs merge=lfs -text
89
  token2wav-axmodels/flow.input_embedding.float16.bin filter=lfs diff=lfs merge=lfs -text
90
  scripts/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken filter=lfs diff=lfs merge=lfs -text
88
  token2wav-axmodels/flow_estimator_200.axmodel filter=lfs diff=lfs merge=lfs -text
89
  token2wav-axmodels/flow.input_embedding.float16.bin filter=lfs diff=lfs merge=lfs -text
90
  scripts/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken filter=lfs diff=lfs merge=lfs -text
91
+ asset/zh_man1.wav filter=lfs diff=lfs merge=lfs -text
92
+ asset/zh_man2.mp3 filter=lfs diff=lfs merge=lfs -text
93
+ asset/zh_woman1.txt filter=lfs diff=lfs merge=lfs -text
94
+ asset/en_man1.mp3 filter=lfs diff=lfs merge=lfs -text
95
+ asset/en_man1.txt filter=lfs diff=lfs merge=lfs -text
96
+ asset/en_woman1.mp3 filter=lfs diff=lfs merge=lfs -text
97
+ asset/en_woman1.txt filter=lfs diff=lfs merge=lfs -text
98
+ asset/zh_man1.txt filter=lfs diff=lfs merge=lfs -text
99
+ asset/zh_woman1.wav filter=lfs diff=lfs merge=lfs -text
100
+ asset/zh_man2.txt filter=lfs diff=lfs merge=lfs -text
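
The entries above register the newly added reference audio and transcript assets with Git LFS. If you add further assets of your own, equivalent lines can be generated with the git-lfs CLI instead of editing `.gitattributes` by hand; a minimal sketch (the patterns are illustrative):

```shell
# Track additional reference clips with Git LFS; this appends the matching
# "filter=lfs diff=lfs merge=lfs -text" rules to .gitattributes.
git lfs track "asset/*.wav" "asset/*.mp3"
git add .gitattributes asset/
```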
.gitignore CHANGED
File without changes
README.md CHANGED
@@ -43,65 +43,25 @@ Download all files from this repository to the device
43
 
44
  ### 1. Text to Speech (Voice Cloning)
45
 
46
- #### 1. Prepare Dependencies
47
 
48
- ##### (1). Install python library
49
- Steps 2 and 3 require the use of these Python packages. If you run Steps 2 and 3 on a PC, install them on the PC.
 
50
  ```
51
  pip3 install -r scripts/requirements.txt
52
  ```
53
 
54
- ##### (2). Downlaod wetext
55
- ```
56
- pip3 install modelscope
57
- modelscope download --model pengzhendong/wetext --local_dir pengzhendong/wetext
58
- ```
59
-
60
- #### 2. Process Prompt Speech
61
- ```
62
- python scripts/process_prompt.py
63
- ```
64
-
65
- Pass parameters according to the actual situation.
66
- ```
67
- args.add_argument('--model_dir', type=str, default="../../model_convert/pretrained_models/CosyVoice2-0.5B/")
68
- args.add_argument('--wetext_dir', type=str, default="../../model_convert/pengzhendong/wetext/")
69
- args.add_argument('--sample_rate', type=int, default=24000)
70
- args.add_argument('--zero_shot_spk_id', type=str, default="")
71
- args.add_argument('--tts_text', type=str, default="君不见黄河之水天上来,奔流到海不复回。君不见高堂明镜悲白发,朝如青丝暮成雪。")
72
- args.add_argument('--prompt_text', type=str, default="希望你以后能够做的比我还好呦。")
73
- args.add_argument('--prompt_speech', type=str, default="../../model_convert/asset/zero_shot_prompt.wav")
74
- ```
75
- After executing the above command, files like the following will be generated:
76
- ```
77
- prompt_text_1_15.txt
78
- llm_prompt_speech_token_1_87.txt
79
- flow_prompt_speech_token_1_87.txt
80
- prompt_speech_feat_1_174_80.txt
81
- llm_embedding_1_192.txt
82
- flow_embedding_1_192.txt
83
- text_1_38.txt
84
- rand_noise_1_80_300.txt
85
- speech_window_2x8x480.txt
86
- ```
87
- The prompt_files directory contains files generated based on the default prompt speech. You can use them directly without running this command.
88
-
89
- #### 3. Start HTTP Tokenizer Server
90
  ```
91
  cd scripts
92
  python cosyvoice2_tokenizer.py --host {your host} --port {your port}
93
  ```
94
 
95
- #### 4. Run on AX650 Board
96
 1) Modify the HTTP host in `run.sh`.
97
- 2) Copy these files to AX650 Board
98
- ```
99
- run.sh main_ax650 prompt_text_1_15.txt llm_prompt_speech_token_1_87.txt flow_prompt_speech_token_1_87.txt \
100
- prompt_speech_feat_1_174_80.txt llm_embedding_1_192.txt flow_embedding_1_192.txt text_1_38.txt \
101
- rand_noise_1_80_300.txt speech_window_2x8x480.txt \
102
- CosyVoice-BlankEN-Ax650-prefill_512/ token2wav-axmodels/
103
- ```
104
- 3) Run `run.sh`
105
  ```shell
106
  root@ax650 ~/Cosyvoice2 # bash run.sh
107
  rm: cannot remove 'output*.wav': No such file or directory
@@ -163,4 +123,53 @@ text >>
163
  ```
164
 
165
  Output Speech:
166
- [output.wav](asset/output.wav)
43
 
44
  ### 1. Text to Speech (Voice Cloning)
45
 
46
+ #### (1). Copy this project to the AX650 Board
47
 
48
+ #### (2). Prepare Dependencies
49
+
50
+ **Starting the HTTP Tokenizer Server** and **Processing Prompt Speech** require these Python packages. If you run these two steps on a PC, install the packages on the PC.
51
  ```
52
  pip3 install -r scripts/requirements.txt
53
  ```
54
 
55
+ #### 2. Start HTTP Tokenizer Server
56
  ```
57
  cd scripts
58
  python cosyvoice2_tokenizer.py --host {your host} --port {your port}
59
  ```
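
For example, to make the server reachable from the board, you might bind to all interfaces and reuse the port already referenced in `run.sh` (0.0.0.0 and 12345 are placeholder values):

```shell
cd scripts
# The port must match the --filename_tokenizer_model URL in run.sh
# (http://127.0.0.1:12345 by default); change both together if needed.
python cosyvoice2_tokenizer.py --host 0.0.0.0 --port 12345
```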
60
 
61
+ #### 3. Run on AX650 Board
62
 1) Modify the HTTP host in `run.sh`.
63
+
64
+ 2) Run `run.sh`
65
  ```shell
66
  root@ax650 ~/Cosyvoice2 # bash run.sh
67
  rm: cannot remove 'output*.wav': No such file or directory
 
123
  ```
124
 
125
  Output Speech:
126
+ [output.wav](asset/output.wav)
127
+
128
+
129
+ #### Optional. Process Prompt Speech
130
+ If you want to clone a specific voice, perform this step.
131
+
132
+ ##### (1). Download wetext
133
+ ```
134
+ pip3 install modelscope
135
+ modelscope download --model pengzhendong/wetext --local_dir pengzhendong/wetext
136
+ ```
137
+
138
+ ##### (2). Process Prompt Speech
139
+ ```
140
+ python scripts/process_prompt.py
141
+ ```
142
+
143
+ Pass parameters as appropriate for your setup.
144
+ ```
145
+ python scripts/process_prompt.py -h
146
+
147
+ usage: process_prompt.py [-h] [--model_dir MODEL_DIR] [--wetext_dir WETEXT_DIR] [--sample_rate SAMPLE_RATE] [--prompt_text PROMPT_TEXT] [--prompt_speech PROMPT_SPEECH]
148
+ [--output OUTPUT]
149
+
150
+ options:
151
+ -h, --help show this help message and exit
152
+ --model_dir MODEL_DIR
153
+ tokenizer configuration directory
154
+ --wetext_dir WETEXT_DIR
155
+ path to wetext
156
+ --sample_rate SAMPLE_RATE
157
+ Sampling rate for prompt audio
158
+ --prompt_text PROMPT_TEXT
159
+ The text content of the prompt(reference) audio. Text or file path.
160
+ --prompt_speech PROMPT_SPEECH
161
+ The path to prompt(reference) audio.
162
+ --output OUTPUT Output data storage directory
163
+ ```
164
+
165
+ After executing the above command, files like the following will be generated:
166
+ ```
167
+ flow_embedding.txt
168
+ flow_prompt_speech_token.txt
169
+ llm_embedding.txt
170
+ llm_prompt_speech_token.txt
171
+ prompt_speech_feat.txt
172
+ prompt_text.txt
173
+ ```
174
+
175
+ When you run `run.sh`, pass this output directory to the `prompt_files` parameter of the script.
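
For example, a minimal sketch using one of the bundled reference clips (assuming `asset/zh_man1.txt` holds the transcript of `asset/zh_man1.wav`; the output directory name is arbitrary):

```shell
# Generate prompt files for a custom voice into a dedicated directory
python scripts/process_prompt.py \
    --prompt_speech asset/zh_man1.wav \
    --prompt_text asset/zh_man1.txt \
    --output prompt_files_zh_man1

# Then reference that directory in run.sh, e.g. --prompt_files prompt_files_zh_man1
```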
asset/{dingding.png → en_man1.mp3} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3870bb0a4e3df1f643e09c960b7e03d80da798509c86eaa326db205236b861d5
3
- size 96417
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:461dd4cc9cf5bf6b774a9978cc9b7ca96033b214714b12413ecfe9eb1bf03ab9
3
+ size 15309
asset/{cross_lingual_prompt.wav → en_man1.txt} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:353a7715c2e4811f4045658b29d1ce67ecad5120e09de10ce890f1763aab486c
3
- size 606404
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce5d3c2b96bf649e61817fd44c913c9abfa2314b3265ad6f115fd5c2477cc017
3
+ size 66
asset/en_woman1.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:872ff69b74b37763cfc4a49bdd39d8a2acf51f428e42e1ab9fa3dfc0c4a2e3d4
3
+ size 16941
asset/en_woman1.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c14383963cba5217b00603065c4c1fc4167155d5c8ae8d6b5b6b92c81b8eef6b
3
+ size 67
asset/zh_man1.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac9b54ab8e18581b2fce95bd8e4f8aa4e840beec28d56304b86359e095c57bce
3
+ size 57
asset/zh_man1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da1153fca1303cd20470317a4ba93027cc5e172214b777747215add36f41109e
3
+ size 1536044
asset/zh_man2.mp3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd012ac30fe1ffb5bc3e356a84f4f668a25a62c72f810ffae218f83cbcfdf53e
3
+ size 31761
asset/zh_man2.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c68ca97e76f6a966fbee90d95ba2210dad6f1c07fcae0f445282b0035823472
3
+ size 69
asset/zh_woman1.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ff1a7dd8cb643e4f769735733e7547ff66aa5b29d99f674131f3fb448446efa
3
+ size 45
asset/{zero_shot_prompt.wav → zh_woman1.wav} RENAMED
File without changes
config.json CHANGED
File without changes
main_ax650 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:12b33e0ad1f44c3b6fa18a3e4ddbcc467db834f9dc85716073781f9e8041be8e
3
- size 6637440
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8d36bca7a681437db6ece77226fa6e00605d613b2ab7028de50cd525ec6575c6
3
+ size 6641632
prompt_files/flow_embedding_1_192.txt DELETED
@@ -1,192 +0,0 @@
1
- 9.833880662918090820e-01
2
- 3.727950453758239746e-01
3
- -6.032654047012329102e-01
4
- -5.367950201034545898e-01
5
- 1.168318033218383789e+00
6
- -4.463896751403808594e-01
7
- -8.533740043640136719e-02
8
- 6.770751476287841797e-01
9
- 4.399921894073486328e-01
10
- -8.947002887725830078e-02
11
- -4.119307696819305420e-01
12
- -1.400911569595336914e+00
13
- 1.469335317611694336e+00
14
- 7.800692915916442871e-01
15
- 6.252028942108154297e-01
16
- -1.524239063262939453e+00
17
- 4.095870852470397949e-01
18
- 8.922567367553710938e-01
19
- 1.414063215255737305e+00
20
- -3.570723533630371094e-01
21
- 3.816263973712921143e-01
22
- -2.559853792190551758e-01
23
- 9.759899973869323730e-01
24
- -2.347678393125534058e-01
25
- -8.310836553573608398e-01
26
- -1.119347572326660156e+00
27
- 6.822414696216583252e-02
28
- 1.058485746383666992e+00
29
- 2.381889820098876953e-01
30
- -2.013707756996154785e-01
31
- -4.302661716938018799e-01
32
- -1.057960271835327148e+00
33
- 1.127839088439941406e+00
34
- -1.518161177635192871e+00
35
- -5.298921465873718262e-01
36
- -1.788670778274536133e+00
37
- -3.309334218502044678e-01
38
- 1.011094689369201660e+00
39
- -3.399490118026733398e-01
40
- -5.792245864868164062e-01
41
- 3.723595738410949707e-01
42
- -3.795529901981353760e-02
43
- -9.215813875198364258e-01
44
- -2.451439201831817627e-01
45
- -1.136183738708496094e+00
46
- 9.513977169990539551e-02
47
- 7.262014746665954590e-01
48
- -9.598007798194885254e-01
49
- -5.060364603996276855e-01
50
- -2.999072074890136719e-01
51
- -7.779634594917297363e-01
52
- 1.212495565414428711e+00
53
- 3.001802563667297363e-01
54
- -2.383058547973632812e+00
55
- 1.490965783596038818e-01
56
- 5.186975002288818359e-02
57
- 1.555646419525146484e+00
58
- -7.905082702636718750e-01
59
- 6.895875930786132812e-01
60
- -7.865182161331176758e-01
61
- -1.267613649368286133e+00
62
- 5.915310978889465332e-01
63
- -3.206543624401092529e-01
64
- 3.275410532951354980e-01
65
- -7.800404429435729980e-01
66
- 2.810131907463073730e-01
67
- -5.581974983215332031e-02
68
- -6.896089911460876465e-01
69
- -1.699091911315917969e+00
70
- 8.533768653869628906e-01
71
- -1.143321990966796875e+00
72
- 1.108269929885864258e+00
73
- 1.488067150115966797e+00
74
- 4.714697599411010742e-01
75
- -2.468206435441970825e-01
76
- -2.778674662113189697e-01
77
- -5.726919770240783691e-01
78
- 7.966566681861877441e-01
79
- 3.259438872337341309e-01
80
- 7.238841056823730469e-01
81
- 1.317236185073852539e+00
82
- -6.427643299102783203e-01
83
- -6.616854071617126465e-01
84
- 3.449333608150482178e-01
85
- 1.523873805999755859e+00
86
- -1.770880818367004395e+00
87
- 4.459496736526489258e-01
88
- -1.308673977851867676e+00
89
- -8.378249406814575195e-01
90
- -7.776624560356140137e-01
91
- -7.166379690170288086e-01
92
- 1.483591556549072266e+00
93
- -1.046773791313171387e+00
94
- -9.184205532073974609e-02
95
- -5.694127678871154785e-01
96
- -7.002854347229003906e-01
97
- -5.811145305633544922e-01
98
- -1.267730951309204102e+00
99
- 1.940409541130065918e+00
100
- 7.551879882812500000e-01
101
- 3.788790851831436157e-02
102
- -1.767819404602050781e+00
103
- 1.966339051723480225e-01
104
- 2.125173091888427734e+00
105
- 4.033783376216888428e-01
106
- -6.449738740921020508e-01
107
- -7.214421778917312622e-02
108
- 1.038697957992553711e+00
109
- -1.720039248466491699e+00
110
- -5.593552589416503906e-01
111
- 6.905189156532287598e-01
112
- 1.693801283836364746e+00
113
- 1.025780200958251953e+00
114
- 1.601356863975524902e-01
115
- 1.841381192207336426e-03
116
- -1.340688228607177734e+00
117
- 7.914224863052368164e-01
118
- -4.111509919166564941e-01
119
- -9.689708948135375977e-01
120
- 9.706826806068420410e-01
121
- 3.221712112426757812e-01
122
- -1.017553806304931641e+00
123
- 6.374475359916687012e-01
124
- -1.567446827888488770e+00
125
- 1.079622745513916016e+00
126
- -6.838436126708984375e-01
127
- -7.464203834533691406e-01
128
- 4.736322760581970215e-01
129
- 7.230627536773681641e-02
130
- -1.091879606246948242e+00
131
- -2.780759036540985107e-01
132
- 6.500254869461059570e-01
133
- -1.413071602582931519e-01
134
- -6.677935123443603516e-01
135
- -5.637246370315551758e-01
136
- 1.807020783424377441e+00
137
- 2.142686128616333008e+00
138
- 4.661364853382110596e-01
139
- -7.062357068061828613e-01
140
- -7.115917205810546875e-01
141
- 1.251373767852783203e+00
142
- -1.802901387214660645e+00
143
- -1.352177619934082031e+00
144
- -3.198754191398620605e-01
145
- 1.498459577560424805e-01
146
- -4.831680059432983398e-01
147
- 7.488607764244079590e-01
148
- 8.024247884750366211e-01
149
- 7.148905396461486816e-01
150
- -1.689905524253845215e-01
151
- -3.437060117721557617e-01
152
- 1.340401619672775269e-01
153
- 1.683871150016784668e+00
154
- 1.002604246139526367e+00
155
- 1.308276414871215820e+00
156
- -7.617053985595703125e-01
157
- -2.677526175975799561e-01
158
- -7.422828674316406250e-01
159
- 5.662541985511779785e-01
160
- -9.786943793296813965e-01
161
- 5.175768136978149414e-01
162
- -2.861405014991760254e-01
163
- 8.294684886932373047e-01
164
- -1.999751329421997070e-01
165
- 1.037881255149841309e+00
166
- 5.464680194854736328e-01
167
- -8.660980463027954102e-01
168
- 4.928737580776214600e-01
169
- -6.311498880386352539e-01
170
- 3.337791562080383301e-01
171
- 9.849458932876586914e-01
172
- -1.106900125741958618e-01
173
- 2.177442312240600586e-01
174
- 1.023627996444702148e+00
175
- 7.414194345474243164e-01
176
- 1.292455196380615234e+00
177
- 6.313494443893432617e-01
178
- 9.998620748519897461e-01
179
- 2.719911038875579834e-01
180
- 2.164029121398925781e+00
181
- 5.713845491409301758e-01
182
- 1.178232431411743164e+00
183
- 1.090514659881591797e-02
184
- -4.316673576831817627e-01
185
- -1.270594716072082520e+00
186
- 5.932700037956237793e-01
187
- -1.272589564323425293e+00
188
- 9.731127023696899414e-01
189
- 9.898380041122436523e-01
190
- -3.958564698696136475e-01
191
- -5.807604193687438965e-01
192
- 5.031570792198181152e-01
prompt_files/flow_prompt_speech_token_1_87.txt DELETED
@@ -1,87 +0,0 @@
1
- 1520
2
- 4299
3
- 6486
4
- 6486
5
- 6486
6
- 6486
7
- 4299
8
- 2031
9
- 5136
10
- 5405
11
- 537
12
- 5263
13
- 4528
14
- 4862
15
- 146
16
- 1561
17
- 1565
18
- 4795
19
- 5073
20
- 2752
21
- 395
22
- 2927
23
- 5589
24
- 6327
25
- 5023
26
- 4780
27
- 5591
28
- 2687
29
- 1308
30
- 3159
31
- 5832
32
- 5838
33
- 736
34
- 1797
35
- 1882
36
- 758
37
- 3749
38
- 2076
39
- 441
40
- 4970
41
- 2261
42
- 6378
43
- 5661
44
- 5086
45
- 2486
46
- 220
47
- 1107
48
- 3005
49
- 3650
50
- 5348
51
- 2511
52
- 1569
53
- 5106
54
- 1542
55
- 2139
56
- 1695
57
- 1295
58
- 3563
59
- 3805
60
- 5800
61
- 5829
62
- 5831
63
- 707
64
- 572
65
- 5672
66
- 3411
67
- 6075
68
- 3658
69
- 5192
70
- 4543
71
- 5103
72
- 5589
73
- 4943
74
- 527
75
- 860
76
- 3644
77
- 4598
78
- 5049
79
- 5061
80
- 5682
81
- 6486
82
- 6486
83
- 6486
84
- 6486
85
- 6486
86
- 6486
87
- 4299
prompt_files/llm_embedding_1_192.txt DELETED
@@ -1,192 +0,0 @@
1
- 9.833880662918090820e-01
2
- 3.727950453758239746e-01
3
- -6.032654047012329102e-01
4
- -5.367950201034545898e-01
5
- 1.168318033218383789e+00
6
- -4.463896751403808594e-01
7
- -8.533740043640136719e-02
8
- 6.770751476287841797e-01
9
- 4.399921894073486328e-01
10
- -8.947002887725830078e-02
11
- -4.119307696819305420e-01
12
- -1.400911569595336914e+00
13
- 1.469335317611694336e+00
14
- 7.800692915916442871e-01
15
- 6.252028942108154297e-01
16
- -1.524239063262939453e+00
17
- 4.095870852470397949e-01
18
- 8.922567367553710938e-01
19
- 1.414063215255737305e+00
20
- -3.570723533630371094e-01
21
- 3.816263973712921143e-01
22
- -2.559853792190551758e-01
23
- 9.759899973869323730e-01
24
- -2.347678393125534058e-01
25
- -8.310836553573608398e-01
26
- -1.119347572326660156e+00
27
- 6.822414696216583252e-02
28
- 1.058485746383666992e+00
29
- 2.381889820098876953e-01
30
- -2.013707756996154785e-01
31
- -4.302661716938018799e-01
32
- -1.057960271835327148e+00
33
- 1.127839088439941406e+00
34
- -1.518161177635192871e+00
35
- -5.298921465873718262e-01
36
- -1.788670778274536133e+00
37
- -3.309334218502044678e-01
38
- 1.011094689369201660e+00
39
- -3.399490118026733398e-01
40
- -5.792245864868164062e-01
41
- 3.723595738410949707e-01
42
- -3.795529901981353760e-02
43
- -9.215813875198364258e-01
44
- -2.451439201831817627e-01
45
- -1.136183738708496094e+00
46
- 9.513977169990539551e-02
47
- 7.262014746665954590e-01
48
- -9.598007798194885254e-01
49
- -5.060364603996276855e-01
50
- -2.999072074890136719e-01
51
- -7.779634594917297363e-01
52
- 1.212495565414428711e+00
53
- 3.001802563667297363e-01
54
- -2.383058547973632812e+00
55
- 1.490965783596038818e-01
56
- 5.186975002288818359e-02
57
- 1.555646419525146484e+00
58
- -7.905082702636718750e-01
59
- 6.895875930786132812e-01
60
- -7.865182161331176758e-01
61
- -1.267613649368286133e+00
62
- 5.915310978889465332e-01
63
- -3.206543624401092529e-01
64
- 3.275410532951354980e-01
65
- -7.800404429435729980e-01
66
- 2.810131907463073730e-01
67
- -5.581974983215332031e-02
68
- -6.896089911460876465e-01
69
- -1.699091911315917969e+00
70
- 8.533768653869628906e-01
71
- -1.143321990966796875e+00
72
- 1.108269929885864258e+00
73
- 1.488067150115966797e+00
74
- 4.714697599411010742e-01
75
- -2.468206435441970825e-01
76
- -2.778674662113189697e-01
77
- -5.726919770240783691e-01
78
- 7.966566681861877441e-01
79
- 3.259438872337341309e-01
80
- 7.238841056823730469e-01
81
- 1.317236185073852539e+00
82
- -6.427643299102783203e-01
83
- -6.616854071617126465e-01
84
- 3.449333608150482178e-01
85
- 1.523873805999755859e+00
86
- -1.770880818367004395e+00
87
- 4.459496736526489258e-01
88
- -1.308673977851867676e+00
89
- -8.378249406814575195e-01
90
- -7.776624560356140137e-01
91
- -7.166379690170288086e-01
92
- 1.483591556549072266e+00
93
- -1.046773791313171387e+00
94
- -9.184205532073974609e-02
95
- -5.694127678871154785e-01
96
- -7.002854347229003906e-01
97
- -5.811145305633544922e-01
98
- -1.267730951309204102e+00
99
- 1.940409541130065918e+00
100
- 7.551879882812500000e-01
101
- 3.788790851831436157e-02
102
- -1.767819404602050781e+00
103
- 1.966339051723480225e-01
104
- 2.125173091888427734e+00
105
- 4.033783376216888428e-01
106
- -6.449738740921020508e-01
107
- -7.214421778917312622e-02
108
- 1.038697957992553711e+00
109
- -1.720039248466491699e+00
110
- -5.593552589416503906e-01
111
- 6.905189156532287598e-01
112
- 1.693801283836364746e+00
113
- 1.025780200958251953e+00
114
- 1.601356863975524902e-01
115
- 1.841381192207336426e-03
116
- -1.340688228607177734e+00
117
- 7.914224863052368164e-01
118
- -4.111509919166564941e-01
119
- -9.689708948135375977e-01
120
- 9.706826806068420410e-01
121
- 3.221712112426757812e-01
122
- -1.017553806304931641e+00
123
- 6.374475359916687012e-01
124
- -1.567446827888488770e+00
125
- 1.079622745513916016e+00
126
- -6.838436126708984375e-01
127
- -7.464203834533691406e-01
128
- 4.736322760581970215e-01
129
- 7.230627536773681641e-02
130
- -1.091879606246948242e+00
131
- -2.780759036540985107e-01
132
- 6.500254869461059570e-01
133
- -1.413071602582931519e-01
134
- -6.677935123443603516e-01
135
- -5.637246370315551758e-01
136
- 1.807020783424377441e+00
137
- 2.142686128616333008e+00
138
- 4.661364853382110596e-01
139
- -7.062357068061828613e-01
140
- -7.115917205810546875e-01
141
- 1.251373767852783203e+00
142
- -1.802901387214660645e+00
143
- -1.352177619934082031e+00
144
- -3.198754191398620605e-01
145
- 1.498459577560424805e-01
146
- -4.831680059432983398e-01
147
- 7.488607764244079590e-01
148
- 8.024247884750366211e-01
149
- 7.148905396461486816e-01
150
- -1.689905524253845215e-01
151
- -3.437060117721557617e-01
152
- 1.340401619672775269e-01
153
- 1.683871150016784668e+00
154
- 1.002604246139526367e+00
155
- 1.308276414871215820e+00
156
- -7.617053985595703125e-01
157
- -2.677526175975799561e-01
158
- -7.422828674316406250e-01
159
- 5.662541985511779785e-01
160
- -9.786943793296813965e-01
161
- 5.175768136978149414e-01
162
- -2.861405014991760254e-01
163
- 8.294684886932373047e-01
164
- -1.999751329421997070e-01
165
- 1.037881255149841309e+00
166
- 5.464680194854736328e-01
167
- -8.660980463027954102e-01
168
- 4.928737580776214600e-01
169
- -6.311498880386352539e-01
170
- 3.337791562080383301e-01
171
- 9.849458932876586914e-01
172
- -1.106900125741958618e-01
173
- 2.177442312240600586e-01
174
- 1.023627996444702148e+00
175
- 7.414194345474243164e-01
176
- 1.292455196380615234e+00
177
- 6.313494443893432617e-01
178
- 9.998620748519897461e-01
179
- 2.719911038875579834e-01
180
- 2.164029121398925781e+00
181
- 5.713845491409301758e-01
182
- 1.178232431411743164e+00
183
- 1.090514659881591797e-02
184
- -4.316673576831817627e-01
185
- -1.270594716072082520e+00
186
- 5.932700037956237793e-01
187
- -1.272589564323425293e+00
188
- 9.731127023696899414e-01
189
- 9.898380041122436523e-01
190
- -3.958564698696136475e-01
191
- -5.807604193687438965e-01
192
- 5.031570792198181152e-01
prompt_files/llm_prompt_speech_token_1_87.txt DELETED
@@ -1,87 +0,0 @@
1
- 1520
2
- 4299
3
- 6486
4
- 6486
5
- 6486
6
- 6486
7
- 4299
8
- 2031
9
- 5136
10
- 5405
11
- 537
12
- 5263
13
- 4528
14
- 4862
15
- 146
16
- 1561
17
- 1565
18
- 4795
19
- 5073
20
- 2752
21
- 395
22
- 2927
23
- 5589
24
- 6327
25
- 5023
26
- 4780
27
- 5591
28
- 2687
29
- 1308
30
- 3159
31
- 5832
32
- 5838
33
- 736
34
- 1797
35
- 1882
36
- 758
37
- 3749
38
- 2076
39
- 441
40
- 4970
41
- 2261
42
- 6378
43
- 5661
44
- 5086
45
- 2486
46
- 220
47
- 1107
48
- 3005
49
- 3650
50
- 5348
51
- 2511
52
- 1569
53
- 5106
54
- 1542
55
- 2139
56
- 1695
57
- 1295
58
- 3563
59
- 3805
60
- 5800
61
- 5829
62
- 5831
63
- 707
64
- 572
65
- 5672
66
- 3411
67
- 6075
68
- 3658
69
- 5192
70
- 4543
71
- 5103
72
- 5589
73
- 4943
74
- 527
75
- 860
76
- 3644
77
- 4598
78
- 5049
79
- 5061
80
- 5682
81
- 6486
82
- 6486
83
- 6486
84
- 6486
85
- 6486
86
- 6486
87
- 4299
prompt_files/prompt_speech_feat_1_174_80.txt DELETED
The diff for this file is too large to render. See raw diff
 
prompt_files/prompt_text_1_15.txt DELETED
@@ -1,15 +0,0 @@
1
- 99658
2
- 99317
3
- 56568
4
- 23031
5
- 33447
6
- 26232
7
- 99521
8
- 99190
9
- 9370
10
- 56006
11
- 35946
12
- 97706
13
- 52801
14
- 119024
15
- 1773
prompt_files/text_1_38.txt DELETED
@@ -1,38 +0,0 @@
1
- 101956
2
- 16530
3
- 88970
4
- 99789
5
- 99469
6
- 53930
7
- 52510
8
- 35727
9
- 17447
10
- 36407
11
- 3837
12
- 100524
13
- 88653
14
- 26939
15
- 55135
16
- 16530
17
- 58364
18
- 18397
19
- 1773
20
- 101956
21
- 16530
22
- 88970
23
- 44636
24
- 99911
25
- 30858
26
- 100811
27
- 100302
28
- 99243
29
- 28291
30
- 3837
31
- 99816
32
- 29524
33
- 99467
34
- 99691
35
- 109803
36
- 12857
37
- 100167
38
- 1773
run.sh CHANGED
@@ -5,7 +5,7 @@ rm output*.wav
5
  ./main_ax650 \
6
  --template_filename_axmodel "${LLM_DIR}/qwen2_p128_l%d_together.axmodel" \
7
  --token2wav_axmodel_dir $TOKEN2WAV_DIR \
8
- --n_timesteps 6 \
9
  --axmodel_num 24 \
10
  --bos 0 --eos 0 \
11
  --filename_tokenizer_model "http://127.0.0.1:12345" \
@@ -15,7 +15,8 @@ rm output*.wav
15
  --filename_llm_embed "${LLM_DIR}/llm.llm_embedding.float16.bin" \
16
  --filename_speech_embed "${LLM_DIR}/llm.speech_embedding.float16.bin" \
17
  --continue 0 \
18
- --text "君不见黄河之水天上来,奔流到海不复回。君不见高堂明镜悲白发,朝如青丝暮成雪。"
 
19
 
20
 
21
  chmod 777 output*.wav
 
5
  ./main_ax650 \
6
  --template_filename_axmodel "${LLM_DIR}/qwen2_p128_l%d_together.axmodel" \
7
  --token2wav_axmodel_dir $TOKEN2WAV_DIR \
8
+ --n_timesteps 10 \
9
  --axmodel_num 24 \
10
  --bos 0 --eos 0 \
11
  --filename_tokenizer_model "http://127.0.0.1:12345" \
 
15
  --filename_llm_embed "${LLM_DIR}/llm.llm_embedding.float16.bin" \
16
  --filename_speech_embed "${LLM_DIR}/llm.speech_embedding.float16.bin" \
17
  --continue 0 \
18
+ --prompt_files en_woman1 \
19
+ --text "Because he has zero capacity to respond to the two and a half hour"
20
 
21
 
22
  chmod 777 output*.wav
scripts/frontend.py CHANGED
@@ -32,7 +32,7 @@ try:
32
  import ttsfrd
33
  use_ttsfrd = True
34
  except ImportError:
35
- print("failed to import ttsfrd, use wetext instead")
36
  from wetext import Normalizer as ZhNormalizer
37
  from wetext import Normalizer as EnNormalizer
38
  use_ttsfrd = False
@@ -192,6 +192,27 @@ class CosyVoiceFrontEnd:
192
  model_input['text'] = tts_text_token
193
  model_input['text_len'] = tts_text_token_len
194
  return model_input
195
 
196
  def frontend_cross_lingual(self, tts_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
197
  model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k, resample_rate, zero_shot_spk_id)
 
32
  import ttsfrd
33
  use_ttsfrd = True
34
  except ImportError:
35
+
36
  from wetext import Normalizer as ZhNormalizer
37
  from wetext import Normalizer as EnNormalizer
38
  use_ttsfrd = False
 
192
  model_input['text'] = tts_text_token
193
  model_input['text_len'] = tts_text_token_len
194
  return model_input
195
+
196
+ def process_prompt(self, prompt_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
197
+ if zero_shot_spk_id == '':
198
+ prompt_text_token, prompt_text_token_len = self._extract_text_token(prompt_text)
199
+ prompt_speech_resample = torchaudio.transforms.Resample(orig_freq=16000, new_freq=resample_rate)(prompt_speech_16k)
200
+ speech_feat, speech_feat_len = self._extract_speech_feat(prompt_speech_resample)
201
+ speech_token, speech_token_len = self._extract_speech_token(prompt_speech_16k)
202
+ if resample_rate == 24000:
203
+ # cosyvoice2, force speech_feat % speech_token = 2
204
+ token_len = min(int(speech_feat.shape[1] / 2), speech_token.shape[1])
205
+ speech_feat, speech_feat_len[:] = speech_feat[:, :2 * token_len], 2 * token_len
206
+ speech_token, speech_token_len[:] = speech_token[:, :token_len], token_len
207
+ embedding = self._extract_spk_embedding(prompt_speech_16k)
208
+ model_input = {'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
209
+ 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
210
+ 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
211
+ 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
212
+ 'llm_embedding': embedding, 'flow_embedding': embedding}
213
+ else:
214
+ model_input = self.spk2info[zero_shot_spk_id]
215
+ return model_input
216
 
217
  def frontend_cross_lingual(self, tts_text, prompt_speech_16k, resample_rate, zero_shot_spk_id):
218
  model_input = self.frontend_zero_shot(tts_text, '', prompt_speech_16k, resample_rate, zero_shot_spk_id)
scripts/process_prompt.py CHANGED
@@ -1,4 +1,5 @@
1
  import argparse
 
2
  import torch
3
  import torchaudio
4
  import numpy as np
@@ -15,15 +16,15 @@ def load_wav(wav, target_sr):
15
  if __name__ == "__main__":
16
 
17
  args = argparse.ArgumentParser()
18
- args.add_argument('--model_dir', type=str, default="scripts/CosyVoice-BlankEN")
19
- args.add_argument('--wetext_dir', type=str, default="pengzhendong/wetext")
20
- args.add_argument('--sample_rate', type=int, default=24000)
21
- args.add_argument('--zero_shot_spk_id', type=str, default="")
22
- args.add_argument('--tts_text', type=str, default="君不见黄河之水天上来,奔流到海不复回。君不见高堂明镜悲白发,朝如青丝暮成雪。")
23
- args.add_argument('--prompt_text', type=str, default="希望你以后能够做的比我还好呦。")
24
- args.add_argument('--prompt_speech', type=str, default="asset/zero_shot_prompt.wav")
25
  args = args.parse_args()
26
 
 
27
 
28
  frontend = CosyVoiceFrontEnd(f"{args.model_dir}",
29
  args.wetext_dir,
@@ -33,27 +34,29 @@ if __name__ == "__main__":
33
  "all")
34
 
35
  prompt_speech_16k = load_wav(args.prompt_speech, 16000)
36
- model_input = frontend.frontend_zero_shot(args.tts_text, args.prompt_text, prompt_speech_16k, args.sample_rate, args.zero_shot_spk_id)
37
 
38
  # model_input = {'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
39
  # 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
40
  # 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
41
  # 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
42
  # 'llm_embedding': embedding, 'flow_embedding': embedding}
43
-
 
44
  for k, v in model_input.items():
45
  if "_len" in k:
46
  continue
47
  shapes = [str(s) for s in v.shape]
48
  shape_str = "_".join(shapes)
49
  if v.dtype in (torch.int32, torch.int64):
50
- np.savetxt(f"{k}_{shape_str}.txt", v.detach().cpu().numpy().reshape(-1), fmt="%d", delimiter=",")
51
  else:
52
- np.savetxt(f"{k}_{shape_str}.txt", v.detach().cpu().numpy().reshape(-1), delimiter=",")
53
-
54
-
55
- rand_noise = torch.randn([1, 80, 300])
56
- np.savetxt("rand_noise_1_80_300.txt", rand_noise.numpy().reshape(-1), delimiter=",")
57
-
58
- speech_window = np.hamming(2 * 8 * 480)
59
- np.savetxt("speech_window_2x8x480.txt", speech_window.reshape(-1), delimiter=",")
 
1
  import argparse
2
+ import os
3
  import torch
4
  import torchaudio
5
  import numpy as np
 
16
  if __name__ == "__main__":
17
 
18
  args = argparse.ArgumentParser()
19
+ args.add_argument('--model_dir', type=str, default="scripts/CosyVoice-BlankEN", help="tokenizer configuration directory")
20
+ args.add_argument('--wetext_dir', type=str, default="pengzhendong/wetext", help="path to wetext")
21
+ args.add_argument('--sample_rate', type=int, default=24000, help="Sampling rate for prompt audio")
22
+ args.add_argument('--prompt_text', type=str, default="希望你以后能够做的比我还好呦。", help="The text content of the prompt(reference) audio. Text or file path.")
23
+ args.add_argument('--prompt_speech', type=str, default="asset/zero_shot_prompt.wav", help="The path to prompt(reference) audio.")
24
+ args.add_argument('--output', type=str, default="prompt_files", help="Output data storage directory")
 
25
  args = args.parse_args()
26
 
27
+ os.makedirs(args.output, exist_ok=True)
28
 
29
  frontend = CosyVoiceFrontEnd(f"{args.model_dir}",
30
  args.wetext_dir,
 
34
  "all")
35
 
36
  prompt_speech_16k = load_wav(args.prompt_speech, 16000)
37
+ zero_shot_spk_id = ""
38
+
39
+ if os.path.isfile(args.prompt_text):
40
+ with open(args.prompt_text, "r") as f:
41
+ prompt_text = f.read()
42
+ else:
43
+ prompt_text = args.prompt_text
44
+ print("prompt_text", prompt_text)
45
+ model_input = frontend.process_prompt(prompt_text, prompt_speech_16k, args.sample_rate, zero_shot_spk_id)
46
 
47
  # model_input = {'prompt_text': prompt_text_token, 'prompt_text_len': prompt_text_token_len,
48
  # 'llm_prompt_speech_token': speech_token, 'llm_prompt_speech_token_len': speech_token_len,
49
  # 'flow_prompt_speech_token': speech_token, 'flow_prompt_speech_token_len': speech_token_len,
50
  # 'prompt_speech_feat': speech_feat, 'prompt_speech_feat_len': speech_feat_len,
51
  # 'llm_embedding': embedding, 'flow_embedding': embedding}
52
+ print("prompt speech token size:", model_input["flow_prompt_speech_token"].shape)
53
+ assert model_input["flow_prompt_speech_token"].shape[1] >= 75, f"speech_token length should be >= 75, but got {model_input['flow_prompt_speech_token'].shape[1]}"
54
  for k, v in model_input.items():
55
  if "_len" in k:
56
  continue
57
  shapes = [str(s) for s in v.shape]
58
  shape_str = "_".join(shapes)
59
  if v.dtype in (torch.int32, torch.int64):
60
+ np.savetxt(f"{args.output}/{k}.txt", v.detach().cpu().numpy().reshape(-1), fmt="%d", delimiter=",")
61
  else:
62
+ np.savetxt(f"{args.output}/{k}.txt", v.detach().cpu().numpy().reshape(-1), delimiter=",")
{prompt_files → token2wav-axmodels}/rand_noise_1_80_300.txt RENAMED
File without changes
{prompt_files → token2wav-axmodels}/speech_window_2x8x480.txt RENAMED
File without changes