StormblessedKal committed on
Commit
771c7eb
·
verified ·
1 Parent(s): c2e3bb1

Upload new model: vn-voice-l40s

Browse files
.gitattributes CHANGED
@@ -47,3 +47,4 @@ ellie-v3/generation_11.wav filter=lfs diff=lfs merge=lfs -text
47
  ember-v2/generation_4.wav filter=lfs diff=lfs merge=lfs -text
48
  ember-v2/generation_9.wav filter=lfs diff=lfs merge=lfs -text
49
  vn-voice-h200/4.wav filter=lfs diff=lfs merge=lfs -text
 
 
47
  ember-v2/generation_4.wav filter=lfs diff=lfs merge=lfs -text
48
  ember-v2/generation_9.wav filter=lfs diff=lfs merge=lfs -text
49
  vn-voice-h200/4.wav filter=lfs diff=lfs merge=lfs -text
50
+ vn-voice-l40s/4.wav filter=lfs diff=lfs merge=lfs -text
vn-voice-l40s/4.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6271035391e2193ff0760a5e340ebf46c255541585da2c10420672f5ad337757
3
+ size 927356
vn-voice-l40s/config.yaml ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ log_dir: ./Models/Finetune/vi-voice
2
+ save_freq: 5
3
+ log_interval: 10
4
+ device: cuda
5
+ epochs: 50
6
+ batch_size: 4
7
+ max_len: 310 # maximum number of frames
8
+ pretrained_model: ./Models/Finetune/base_model_120k_vi.pth
9
+ load_only_params: true # set to true if you do not want to load epoch numbers and optimizer parameters
10
+ debug: true
11
+
12
+ data_params:
13
+ train_data: "./Data/train_list.txt"
14
+ val_data: "./Data/val_list.txt"
15
+ root_path: "./Data/24khz/"
16
+
17
+ symbol: #Total 189 symbols
18
+ pad: "$"
19
+ punctuation: ';:,.!?¡¿—…"«»“” '
20
+ letters: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
21
+ letters_ipa: "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
22
+ extend: "∫̆ăη͡123456" #ADD MORE SYMBOLS HERE
23
+
24
+ preprocess_params:
25
+ sr: 24000
26
+ spect_params:
27
+ n_fft: 2048
28
+ win_length: 1200
29
+ hop_length: 300
30
+
31
+ training_strats:
32
+ #All modules: 'decoder', 'predictor', 'text_encoder', 'style_encoder', 'text_aligner', 'pitch_extractor', 'mpd', 'msd'
33
+ freeze_modules: [''] # Not updated when training.
34
+ ignore_modules: [''] # Not loading => fresh start. IMPORTANT: 'text_aligner' and 'pitch_extractor' are pretrained utility modules; DO NOT ignore them.
35
+
36
+ model_params:
37
+ dim_in: 64
38
+ hidden_dim: 512
39
+ max_conv_dim: 512
40
+ n_layer: 3
41
+ n_mels: 80
42
+ max_dur: 50 # maximum duration of a single phoneme
43
+ style_dim: 128 # style vector size
44
+
45
+ dropout: 0.2
46
+
47
+ ASR_params:
48
+ input_dim: 80
49
+ hidden_dim: 256
50
+ n_layers: 6
51
+ token_embedding_dim: 512
52
+
53
+ JDC_params:
54
+ num_class: 1
55
+ seq_len: 192
56
+
57
+ # config for decoder
58
+ decoder:
59
+ type: 'hifigan' # either hifigan or istftnet
60
+ resblock_kernel_sizes: [3,7,11]
61
+ upsample_rates : [10,5,3,2]
62
+ upsample_initial_channel: 512
63
+ resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
64
+ upsample_kernel_sizes: [20,10,6,4]
65
+
66
+ loss_params:
67
+ lambda_mel: 5. # mel reconstruction loss
68
+ lambda_gen: 1. # generator loss
69
+
70
+ lambda_mono: 1. # monotonic alignment loss (TMA)
71
+ lambda_s2s: 1. # sequence-to-sequence loss (TMA)
72
+
73
+ lambda_F0: 1. # F0 reconstruction loss
74
+ lambda_norm: 1. # norm reconstruction loss
75
+ lambda_dur: 1. # duration loss
76
+ lambda_ce: 20. # duration predictor probability output CE loss
77
+
78
+ optimizer_params:
79
+ lr: 0.0001 # general learning rate
80
+ ft_lr: 0.00001 # learning rate for acoustic modules
vn-voice-l40s/epoch_00049.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:85073a78d2b2cecb5bdf28c89203568947cf87f2b5939abf7364e31f9b8251e3
3
+ size 1692227939