{
  "mulan": {
    "sr": 24000,
    "clip_secs": 10,
    "dim_latent": 512,
    "decoupled_contrastive_learning": true,
    "hierarchical_contrastive_loss": false,
    "hierarchical_contrastive_loss_layers": null,
    "sigmoid_contrastive_loss": false,
    "rank_contrast": true
  },
  "audio_model": {
    "name": "OpenMuQ/MuQ-large-msd-iter",
    "model_dim": 1024,
    "use_layer_idx": -1
  },
  "text_model": {
    "name": "xlm-roberta-base",
    "model_dim": null,
    "use_layer_idx": -1
  },
  "audio_transformer": {
    "dim": 768,
    "tf_depth": 0,
    "heads": 8,
    "dim_head": 64,
    "attn_dropout": 0,
    "ff_dropout": 0,
    "ff_mult": 4
  },
  "text_transformer": {
    "dim": 768,
    "tf_depth": 8,
    "max_seq_len": 1024,
    "dim_head": 64,
    "heads": 8,
    "attn_dropout": 0,
    "ff_dropout": 0,
    "ff_mult": 4
  }
}