Upload new model: vn-voice-l40s
Browse files- .gitattributes +1 -0
- vn-voice-l40s/4.wav +3 -0
- vn-voice-l40s/config.yaml +80 -0
- vn-voice-l40s/epoch_00049.pth +3 -0
.gitattributes
CHANGED
|
@@ -47,3 +47,4 @@ ellie-v3/generation_11.wav filter=lfs diff=lfs merge=lfs -text
|
|
| 47 |
ember-v2/generation_4.wav filter=lfs diff=lfs merge=lfs -text
|
| 48 |
ember-v2/generation_9.wav filter=lfs diff=lfs merge=lfs -text
|
| 49 |
vn-voice-h200/4.wav filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 47 |
ember-v2/generation_4.wav filter=lfs diff=lfs merge=lfs -text
|
| 48 |
ember-v2/generation_9.wav filter=lfs diff=lfs merge=lfs -text
|
| 49 |
vn-voice-h200/4.wav filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
vn-voice-l40s/4.wav filter=lfs diff=lfs merge=lfs -text
|
vn-voice-l40s/4.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6271035391e2193ff0760a5e340ebf46c255541585da2c10420672f5ad337757
|
| 3 |
+
size 927356
|
vn-voice-l40s/config.yaml
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
log_dir: ./Models/Finetune/vi-voice
|
| 2 |
+
save_freq: 5
|
| 3 |
+
log_interval: 10
|
| 4 |
+
device: cuda
|
| 5 |
+
epochs: 50
|
| 6 |
+
batch_size: 4
|
| 7 |
+
max_len: 310 # maximum number of frames
|
| 8 |
+
pretrained_model: ./Models/Finetune/base_model_120k_vi.pth
|
| 9 |
+
load_only_params: true # set to true if you do not want to load epoch numbers and optimizer parameters
|
| 10 |
+
debug: true
|
| 11 |
+
|
| 12 |
+
data_params:
|
| 13 |
+
train_data: "./Data/train_list.txt"
|
| 14 |
+
val_data: "./Data/val_list.txt"
|
| 15 |
+
root_path: "./Data/24khz/"
|
| 16 |
+
|
| 17 |
+
symbol: #Total 189 symbols
|
| 18 |
+
pad: "$"
|
| 19 |
+
punctuation: ';:,.!?¡¿—…"«»“” '
|
| 20 |
+
letters: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
|
| 21 |
+
letters_ipa: "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
|
| 22 |
+
extend: "∫̆ăη͡123456" #ADD MORE SYMBOLS HERE
|
| 23 |
+
|
| 24 |
+
preprocess_params:
|
| 25 |
+
sr: 24000
|
| 26 |
+
spect_params:
|
| 27 |
+
n_fft: 2048
|
| 28 |
+
win_length: 1200
|
| 29 |
+
hop_length: 300
|
| 30 |
+
|
| 31 |
+
training_strats:
|
| 32 |
+
#All modules: 'decoder', 'predictor', 'text_encoder', 'style_encoder', 'text_aligner', 'pitch_extractor', 'mpd', 'msd'
|
| 33 |
+
freeze_modules: [''] # Not updated when training.
|
| 34 |
+
ignore_modules: [''] # Not loaded => fresh start. IMPORTANT: 'text_aligner' and 'pitch_extractor' are pretrained utility models; DO NOT ignore them.
|
| 35 |
+
|
| 36 |
+
model_params:
|
| 37 |
+
dim_in: 64
|
| 38 |
+
hidden_dim: 512
|
| 39 |
+
max_conv_dim: 512
|
| 40 |
+
n_layer: 3
|
| 41 |
+
n_mels: 80
|
| 42 |
+
max_dur: 50 # maximum duration of a single phoneme
|
| 43 |
+
style_dim: 128 # style vector size
|
| 44 |
+
|
| 45 |
+
dropout: 0.2
|
| 46 |
+
|
| 47 |
+
ASR_params:
|
| 48 |
+
input_dim: 80
|
| 49 |
+
hidden_dim: 256
|
| 50 |
+
n_layers: 6
|
| 51 |
+
token_embedding_dim: 512
|
| 52 |
+
|
| 53 |
+
JDC_params:
|
| 54 |
+
num_class: 1
|
| 55 |
+
seq_len: 192
|
| 56 |
+
|
| 57 |
+
# config for decoder
|
| 58 |
+
decoder:
|
| 59 |
+
type: 'hifigan' # either hifigan or istftnet
|
| 60 |
+
resblock_kernel_sizes: [3,7,11]
|
| 61 |
+
upsample_rates : [10,5,3,2]
|
| 62 |
+
upsample_initial_channel: 512
|
| 63 |
+
resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
|
| 64 |
+
upsample_kernel_sizes: [20,10,6,4]
|
| 65 |
+
|
| 66 |
+
loss_params:
|
| 67 |
+
lambda_mel: 5. # mel reconstruction loss
|
| 68 |
+
lambda_gen: 1. # generator loss
|
| 69 |
+
|
| 70 |
+
lambda_mono: 1. # monotonic alignment loss (TMA)
|
| 71 |
+
lambda_s2s: 1. # sequence-to-sequence loss (TMA)
|
| 72 |
+
|
| 73 |
+
lambda_F0: 1. # F0 reconstruction loss
|
| 74 |
+
lambda_norm: 1. # norm reconstruction loss
|
| 75 |
+
lambda_dur: 1. # duration loss
|
| 76 |
+
lambda_ce: 20. # duration predictor probability output CE loss
|
| 77 |
+
|
| 78 |
+
optimizer_params:
|
| 79 |
+
lr: 0.0001 # general learning rate
|
| 80 |
+
ft_lr: 0.00001 # learning rate for acoustic modules
|
vn-voice-l40s/epoch_00049.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:85073a78d2b2cecb5bdf28c89203568947cf87f2b5939abf7364e31f9b8251e3
|
| 3 |
+
size 1692227939
|