{
  "architectures": [
    "AprielHForCausalLM"
  ],
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "configuration_apriel_h.AprielHConfig",
    "AutoModel": "modeling_apriel_h.AprielHModel",
    "AutoModelForCausalLM": "modeling_apriel_h.AprielHForCausalLM"
  },
  "bos_token_id": 1,
  "eos_token_id": 2,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 5120,
  "hybrid_block_layout": [
    "t",
    "t",
    "t",
    "t",
    "t",
    "t",
    "t",
    "t",
    "t",
    "t",
    "t",
    "t",
    "t",
    "t",
    "t",
    "t",
    "t",
    "t",
    "t",
    "t",
    "m2",
    "m2",
    "m2",
    "m2",
    "m2",
    "m2",
    "t",
    "m2",
    "t",
    "t",
    "m2",
    "m2",
    "m2",
    "m2",
    "t",
    "m2",
    "m2",
    "m2",
    "m2",
    "m2",
    "m2",
    "m2",
    "m2",
    "m2",
    "m2",
    "m2",
    "m2",
    "m2",
    "t",
    "m2"
  ],
  "initializer_range": 0.02,
  "intermediate_size": 14336,
  "max_position_embeddings": 65536,
  "model_type": "apriel_h",
  "num_attention_heads": 32,
  "num_hidden_layers": 50,
  "num_key_value_heads": 8,
  "rms_norm_eps": 1e-05,
  "rope_scaling": {
    "rope_type": "default"
  },
  "rope_theta": 1000000.0,
  "sliding_window": null,
  "ssm_cfg": {
    "activation": "silu",
    "bias": false,
    "chunk_size": 128,
    "conv_bias": true,
    "d_conv": 4,
    "d_inner": 4096,
    "d_state": 16,
    "d_xb": 1024,
    "dt_init": "random",
    "dt_init_floor": 0.0001,
    "dt_max": 0.1,
    "dt_min": 0.001,
    "dt_rank": 320,
    "dt_scale": 1.0,
    "expand": 1,
    "n_qk_heads": 32,
    "n_v_heads": 32
  },
  "tie_word_embeddings": false,
  "transformers_version": "4.53.2",
  "use_cache": true,
  "vocab_size": 131072
}
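
Because `auto_map` registers custom classes (`configuration_apriel_h.AprielHConfig`, `modeling_apriel_h.AprielHForCausalLM`), this checkpoint must be loaded with `trust_remote_code=True`. Below is a minimal loading and inspection sketch; the checkpoint path is hypothetical (the actual Hub ID or local directory is not given in this config), and "m2" is read as the SSM block type (presumably Mamba-2-style, given `chunk_size`/`d_state` in `ssm_cfg`) versus "t" for standard attention blocks.

```python
from transformers import AutoConfig, AutoModelForCausalLM

# Hypothetical checkpoint location; substitute the real Hub ID or local path.
checkpoint = "path/to/apriel-h-checkpoint"

# auto_map resolves the custom AprielH classes shipped with the checkpoint,
# so trust_remote_code=True is required for both config and model loading.
config = AutoConfig.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True)

# The layout string has one entry per layer: "t" = attention block,
# "m2" = SSM block. Its length matches num_hidden_layers.
layout = config.hybrid_block_layout
assert len(layout) == config.num_hidden_layers  # 50 layers
print(layout.count("t"), "attention blocks,", layout.count("m2"), "SSM blocks")  # 25 / 25
```

Note the split this layout encodes: the first 20 layers are all attention, while the remaining 30 are predominantly SSM blocks with a handful of attention layers interleaved, for an even 25/25 mix overall.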