{ "metadata": { "fast_llm_metadata": { "fast_llm_version": "0.2.0", "model": "hybrid_ssm", "format": "apriel_ssm_thinker_hybrid", "config": { "type": "hybrid_ssm", "base_model": { "transformer": { "type": "lm_decoder", "normalization": { "type": "rms_norm", "epsilon": 1e-05 }, "rotary": { "type": "default", "theta": 1000000.0 }, "peft": { "type": "none" }, "num_layers": 50, "hidden_size": 5120, "num_attention_heads": 32, "head_groups": 8, "add_linear_biases": false, "ffn_hidden_size": 14336, "kv_channels": 128, "gated": true, "activation_type": "silu", "mlp_lr_scale": 1.0, "attention_lr_scale": 1.0 }, "vision_encoder": { "transformer": { "normalization": { "type": "layer_norm" }, "rotary": { "type": "none" }, "peft": { "type": "none" } }, "patch_norm": { "type": "layer_norm" } }, "vocab_size": 131072, "use_position_embeddings": false, "tie_word_embeddings": false, "cross_entropy_impl": "fused", "distillation_loss_implementation": "reverse_kl", "distillation_model": "teacher", "parallel_embeddings": false, "embeddings_lr_scale": 1.0, "output_lr_scale": 1.0, "ssm": { "normalization": { "type": "layer_norm" }, "expansion_factor": 1, "state_size": 16, "conv_kernel_dimension": 4, "dt_rank": 320, "n_qk_heads": 32, "n_v_heads": 32, "d_inner": 4096, "d_xb": 1024, "add_bias_linear": false, "activation_type": "silu", "chunk_size": 128, "dt_init": "random", "dt_scale": 1.0, "dt_min": 0.001, "dt_max": 0.1, "dt_init_floor": 0.0001 }, "hybrid_block_layout": [ "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "m2", "m2", "m2", "m2", "m2", "m2", "t", "m2", "t", "t", "m2", "m2", "m2", "m2", "t", "m2", "m2", "m2", "m2", "m2", "m2", "m2", "m2", "m2", "m2", "m2", "m2", "m2", "t", "m2" ] }, "multi_stage": { "zero_stage": 3 }, "distributed": { "tensor_parallel": 8, "sequence_tensor_parallel": true, "world_size": 64, "rank": 0, "local_world_size": 8, "timeout": 3600.0, "seed": 984060, "training_dtype": "bfloat16" } }, "shards": [ "weights" ], "metadata": { "optimizer": { "current_step": 25000, "grad_scaler": { "type": "NoopGradScaler" } }, "completed_steps": 25000, "metrics": { "Training": { "train_iters": 60000, "batch_size": 64, "iteration": 25000, "distillation_loss": 0.03285701833665371, "language_model_loss": 0.03285701833665371, "consumed_samples": 1600000, "consumed_tokens": 26214400000, "step_time_ms": 13884.377572964877, "step_time_average_ms": 14831.5291135074, "remaining_time": 519103.518972759, "completion_time": 1756313872.9406276, "percent_done": 41.666666666666664, "skipped_iters": 0, "nan_iters": 0, "model_tflops": 125.02209574963935, "hardware_tflops": 128.9816237227079, "tokens_per_sec_per_gpu": 1180.0312915648658, "run": 19, "grad_norm": 0.3009010851383209, "learning_rate": 2.968135593220339e-06, "loss_scale": 1.0, "reserved": 54666.0, "allocated": 17795.271484375, "max_allocated": 47830.01416015625, "max_reserved": 54666.0, "global_max_reserved": 54666.0 } } } }, "model_config": { "model_type": "apriel_ssm_thinker_hybrid", "architectures": [ "AprielThinkerSSMHybridForCausalLM" ], "rope_theta": 1000000.0, "hidden_act": "silu", "num_hidden_layers": 50, "hidden_size": 5120, "num_attention_heads": 32, "num_key_value_heads": 8, "intermediate_size": 14336, "vocab_size": 131072, "tie_word_embeddings": false, "rms_norm_eps": 1e-05, "head_dim": 128, "rope_scaling": { "rope_type": "default" }, "ssm_cfg": { "d_state": 16, "n_v_heads": 32, "n_qk_heads": 32, "expand": 1, "chunk_size": 128, "bias": false, "activation": "silu", "dt_rank": 320, "dt_min": 0.001, "dt_max": 0.1, "dt_init_floor": 0.0001, "dt_scale": 1.0, "d_xb": 1024, "d_conv": 4, "dt_init": "random", "d_inner": 4096, "conv_bias": true }, "hybrid_block_layout": [ "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "t", "m2", "m2", "m2", "m2", "m2", "m2", "t", "m2", "t", "t", "m2", "m2", "m2", "m2", "t", "m2", "m2", "m2", "m2", "m2", "m2", "m2", "m2", "m2", "m2", "m2", "m2", "m2", "t", "m2" ], "auto_map": { "AutoConfig": "configuration_ssm_hybrid_apriel15b.AprielSSMHybridConfig", "AutoModel": "modeling_ssm_hybrid_apriel15b.AprielThinkerSSMHybridModel", "AutoModelForCausalLM": "modeling_ssm_hybrid_apriel15b.AprielThinkerSSMHybridForCausalLM" }, "attn_implementation": null }, "format": "pt" }, "weight_map": { "model.embed_tokens.weight": "model_0.safetensors", "model.layers.0.input_layernorm.weight": "model_0.safetensors", "model.layers.0.post_attention_layernorm.weight": "model_0.safetensors", "model.layers.0.self_attn.q_proj.weight": "model_0.safetensors", "model.layers.0.self_attn.k_proj.weight": "model_0.safetensors", "model.layers.0.self_attn.v_proj.weight": "model_0.safetensors", "model.layers.0.self_attn.o_proj.weight": "model_0.safetensors", "model.layers.0.mlp.gate_proj.weight": "model_0.safetensors", "model.layers.0.mlp.up_proj.weight": "model_0.safetensors", "model.layers.0.mlp.down_proj.weight": "model_0.safetensors", "model.layers.1.input_layernorm.weight": "model_0.safetensors", "model.layers.1.post_attention_layernorm.weight": "model_0.safetensors", "model.layers.1.self_attn.q_proj.weight": "model_0.safetensors", "model.layers.1.self_attn.k_proj.weight": "model_0.safetensors", "model.layers.1.self_attn.v_proj.weight": "model_0.safetensors", "model.layers.1.self_attn.o_proj.weight": "model_0.safetensors", "model.layers.1.mlp.gate_proj.weight": "model_0.safetensors", "model.layers.1.mlp.up_proj.weight": "model_0.safetensors", "model.layers.1.mlp.down_proj.weight": "model_0.safetensors", "model.layers.2.input_layernorm.weight": "model_0.safetensors", "model.layers.2.post_attention_layernorm.weight": "model_0.safetensors", "model.layers.2.self_attn.q_proj.weight": "model_0.safetensors", "model.layers.2.self_attn.k_proj.weight": "model_0.safetensors", "model.layers.2.self_attn.v_proj.weight": "model_0.safetensors", "model.layers.2.self_attn.o_proj.weight": "model_0.safetensors", "model.layers.2.mlp.gate_proj.weight": "model_0.safetensors", "model.layers.2.mlp.up_proj.weight": "model_0.safetensors", "model.layers.2.mlp.down_proj.weight": "model_0.safetensors", "model.layers.3.input_layernorm.weight": "model_0.safetensors", "model.layers.3.post_attention_layernorm.weight": "model_0.safetensors", "model.layers.3.self_attn.q_proj.weight": "model_0.safetensors", "model.layers.3.self_attn.k_proj.weight": "model_0.safetensors", "model.layers.3.self_attn.v_proj.weight": "model_0.safetensors", "model.layers.3.self_attn.o_proj.weight": "model_0.safetensors", "model.layers.3.mlp.gate_proj.weight": "model_0.safetensors", "model.layers.3.mlp.up_proj.weight": "model_0.safetensors", "model.layers.3.mlp.down_proj.weight": "model_0.safetensors", "model.layers.4.input_layernorm.weight": "model_0.safetensors", "model.layers.4.post_attention_layernorm.weight": "model_0.safetensors", "model.layers.4.self_attn.q_proj.weight": "model_0.safetensors", "model.layers.4.self_attn.k_proj.weight": "model_0.safetensors", "model.layers.4.self_attn.v_proj.weight": "model_0.safetensors", "model.layers.4.self_attn.o_proj.weight": "model_0.safetensors", "model.layers.4.mlp.gate_proj.weight": "model_0.safetensors", "model.layers.4.mlp.up_proj.weight": "model_0.safetensors", "model.layers.4.mlp.down_proj.weight": "model_0.safetensors", "model.layers.5.input_layernorm.weight": "model_0.safetensors", "model.layers.5.post_attention_layernorm.weight": "model_0.safetensors", "model.layers.5.self_attn.q_proj.weight": "model_0.safetensors", "model.layers.5.self_attn.k_proj.weight": "model_0.safetensors", "model.layers.5.self_attn.v_proj.weight": "model_0.safetensors", "model.layers.5.self_attn.o_proj.weight": "model_0.safetensors", "model.layers.5.mlp.gate_proj.weight": "model_0.safetensors", "model.layers.5.mlp.up_proj.weight": "model_0.safetensors", "model.layers.5.mlp.down_proj.weight": "model_0.safetensors", "model.layers.6.input_layernorm.weight": "model_0.safetensors", "model.layers.6.post_attention_layernorm.weight": "model_0.safetensors", "model.layers.6.self_attn.q_proj.weight": "model_0.safetensors", "model.layers.6.self_attn.k_proj.weight": "model_0.safetensors", "model.layers.6.self_attn.v_proj.weight": "model_0.safetensors", "model.layers.6.self_attn.o_proj.weight": "model_0.safetensors", "model.layers.6.mlp.gate_proj.weight": "model_0.safetensors", "model.layers.6.mlp.up_proj.weight": "model_0.safetensors", "model.layers.6.mlp.down_proj.weight": "model_0.safetensors", "model.layers.7.input_layernorm.weight": "model_0.safetensors", "model.layers.7.post_attention_layernorm.weight": "model_0.safetensors", "model.layers.7.self_attn.q_proj.weight": "model_0.safetensors", "model.layers.7.self_attn.k_proj.weight": "model_0.safetensors", "model.layers.7.self_attn.v_proj.weight": "model_0.safetensors", "model.layers.7.self_attn.o_proj.weight": "model_0.safetensors", "model.layers.7.mlp.gate_proj.weight": "model_0.safetensors", "model.layers.7.mlp.up_proj.weight": "model_0.safetensors", "model.layers.7.mlp.down_proj.weight": "model_0.safetensors", "model.layers.8.input_layernorm.weight": "model_0.safetensors", "model.layers.8.post_attention_layernorm.weight": "model_0.safetensors", "model.layers.8.self_attn.q_proj.weight": "model_0.safetensors", "model.layers.8.self_attn.k_proj.weight": "model_0.safetensors", "model.layers.8.self_attn.v_proj.weight": "model_0.safetensors", "model.layers.8.self_attn.o_proj.weight": "model_0.safetensors", "model.layers.8.mlp.gate_proj.weight": "model_0.safetensors", "model.layers.8.mlp.up_proj.weight": "model_0.safetensors", "model.layers.8.mlp.down_proj.weight": "model_0.safetensors", "model.layers.9.input_layernorm.weight": "model_0.safetensors", "model.layers.9.post_attention_layernorm.weight": "model_0.safetensors", "model.layers.9.self_attn.q_proj.weight": "model_0.safetensors", "model.layers.9.self_attn.k_proj.weight": "model_0.safetensors", "model.layers.9.self_attn.v_proj.weight": "model_0.safetensors", "model.layers.9.self_attn.o_proj.weight": "model_0.safetensors", "model.layers.9.mlp.gate_proj.weight": "model_0.safetensors", "model.layers.9.mlp.up_proj.weight": "model_0.safetensors", "model.layers.9.mlp.down_proj.weight": "model_0.safetensors", "model.layers.10.input_layernorm.weight": "model_0.safetensors", "model.layers.10.post_attention_layernorm.weight": "model_0.safetensors", "model.layers.10.self_attn.q_proj.weight": "model_0.safetensors", "model.layers.10.self_attn.k_proj.weight": "model_0.safetensors", "model.layers.10.self_attn.v_proj.weight": "model_0.safetensors", "model.layers.10.self_attn.o_proj.weight": "model_0.safetensors", "model.layers.10.mlp.gate_proj.weight": "model_0.safetensors", "model.layers.10.mlp.up_proj.weight": "model_0.safetensors", "model.layers.10.mlp.down_proj.weight": "model_0.safetensors", "model.layers.11.input_layernorm.weight": "model_0.safetensors", "model.layers.11.post_attention_layernorm.weight": "model_0.safetensors", "model.layers.11.self_attn.q_proj.weight": "model_0.safetensors", "model.layers.11.self_attn.k_proj.weight": "model_0.safetensors", "model.layers.11.self_attn.v_proj.weight": "model_0.safetensors", "model.layers.11.self_attn.o_proj.weight": "model_0.safetensors", "model.layers.11.mlp.gate_proj.weight": "model_0.safetensors", "model.layers.11.mlp.up_proj.weight": "model_0.safetensors", "model.layers.11.mlp.down_proj.weight": "model_0.safetensors", "model.layers.12.input_layernorm.weight": "model_0.safetensors", "model.layers.12.post_attention_layernorm.weight": "model_0.safetensors", "model.layers.12.self_attn.q_proj.weight": "model_0.safetensors", "model.layers.12.self_attn.k_proj.weight": "model_0.safetensors", "model.layers.12.self_attn.v_proj.weight": "model_0.safetensors", "model.layers.12.self_attn.o_proj.weight": "model_0.safetensors", "model.layers.12.mlp.gate_proj.weight": "model_0.safetensors", "model.layers.12.mlp.up_proj.weight": "model_0.safetensors", "model.layers.12.mlp.down_proj.weight": "model_0.safetensors", "model.layers.13.input_layernorm.weight": "model_0.safetensors", "model.layers.13.post_attention_layernorm.weight": "model_0.safetensors", "model.layers.13.self_attn.q_proj.weight": "model_0.safetensors", "model.layers.13.self_attn.k_proj.weight": "model_0.safetensors", "model.layers.13.self_attn.v_proj.weight": "model_0.safetensors", "model.layers.13.self_attn.o_proj.weight": "model_0.safetensors", "model.layers.13.mlp.gate_proj.weight": "model_0.safetensors", "model.layers.13.mlp.up_proj.weight": "model_1.safetensors", "model.layers.13.mlp.down_proj.weight": "model_1.safetensors", "model.layers.14.input_layernorm.weight": "model_1.safetensors", "model.layers.14.post_attention_layernorm.weight": "model_1.safetensors", "model.layers.14.self_attn.q_proj.weight": "model_1.safetensors", "model.layers.14.self_attn.k_proj.weight": "model_1.safetensors", "model.layers.14.self_attn.v_proj.weight": "model_1.safetensors", "model.layers.14.self_attn.o_proj.weight": "model_1.safetensors", "model.layers.14.mlp.gate_proj.weight": "model_1.safetensors", "model.layers.14.mlp.up_proj.weight": "model_1.safetensors", "model.layers.14.mlp.down_proj.weight": "model_1.safetensors", "model.layers.15.input_layernorm.weight": "model_1.safetensors", "model.layers.15.post_attention_layernorm.weight": "model_1.safetensors", "model.layers.15.self_attn.q_proj.weight": "model_1.safetensors", "model.layers.15.self_attn.k_proj.weight": "model_1.safetensors", "model.layers.15.self_attn.v_proj.weight": "model_1.safetensors", "model.layers.15.self_attn.o_proj.weight": "model_1.safetensors", "model.layers.15.mlp.gate_proj.weight": "model_1.safetensors", "model.layers.15.mlp.up_proj.weight": "model_1.safetensors", "model.layers.15.mlp.down_proj.weight": "model_1.safetensors", "model.layers.16.input_layernorm.weight": "model_1.safetensors", "model.layers.16.post_attention_layernorm.weight": "model_1.safetensors", "model.layers.16.self_attn.q_proj.weight": "model_1.safetensors", "model.layers.16.self_attn.k_proj.weight": "model_1.safetensors", "model.layers.16.self_attn.v_proj.weight": "model_1.safetensors", "model.layers.16.self_attn.o_proj.weight": "model_1.safetensors", "model.layers.16.mlp.gate_proj.weight": "model_1.safetensors", "model.layers.16.mlp.up_proj.weight": "model_1.safetensors", "model.layers.16.mlp.down_proj.weight": "model_1.safetensors", "model.layers.17.input_layernorm.weight": "model_1.safetensors", "model.layers.17.post_attention_layernorm.weight": "model_1.safetensors", "model.layers.17.self_attn.q_proj.weight": "model_1.safetensors", "model.layers.17.self_attn.k_proj.weight": "model_1.safetensors", "model.layers.17.self_attn.v_proj.weight": "model_1.safetensors", "model.layers.17.self_attn.o_proj.weight": "model_1.safetensors", "model.layers.17.mlp.gate_proj.weight": "model_1.safetensors", "model.layers.17.mlp.up_proj.weight": "model_1.safetensors", "model.layers.17.mlp.down_proj.weight": "model_1.safetensors", "model.layers.18.input_layernorm.weight": "model_1.safetensors", "model.layers.18.post_attention_layernorm.weight": "model_1.safetensors", "model.layers.18.self_attn.q_proj.weight": "model_1.safetensors", "model.layers.18.self_attn.k_proj.weight": "model_1.safetensors", "model.layers.18.self_attn.v_proj.weight": "model_1.safetensors", "model.layers.18.self_attn.o_proj.weight": "model_1.safetensors", "model.layers.18.mlp.gate_proj.weight": "model_1.safetensors", "model.layers.18.mlp.up_proj.weight": "model_1.safetensors", "model.layers.18.mlp.down_proj.weight": "model_1.safetensors", "model.layers.19.input_layernorm.weight": "model_1.safetensors", "model.layers.19.post_attention_layernorm.weight": "model_1.safetensors", "model.layers.19.self_attn.q_proj.weight": "model_1.safetensors", "model.layers.19.self_attn.k_proj.weight": "model_1.safetensors", "model.layers.19.self_attn.v_proj.weight": "model_1.safetensors", "model.layers.19.self_attn.o_proj.weight": "model_1.safetensors", "model.layers.19.mlp.gate_proj.weight": "model_1.safetensors", "model.layers.19.mlp.up_proj.weight": "model_1.safetensors", "model.layers.19.mlp.down_proj.weight": "model_1.safetensors", "model.layers.20.mixer.A_log": "model_1.safetensors", "model.layers.20.mixer.D": "model_1.safetensors", "model.layers.20.input_layernorm.weight": "model_1.safetensors", "model.layers.20.post_attention_layernorm.weight": "model_1.safetensors", "model.layers.20.mixer.dt_in_proj.weight": "model_1.safetensors", "model.layers.20.mixer.conv1d.weight": "model_1.safetensors", "model.layers.20.mixer.conv1d.bias": "model_1.safetensors", "model.layers.20.mixer.dt_proj.bias": "model_1.safetensors", "model.layers.20.mixer.in_proj.weight": "model_1.safetensors", "model.layers.20.mixer.dt_proj.weight": "model_1.safetensors", "model.layers.20.mixer.out_proj.weight": "model_1.safetensors", "model.layers.20.mlp.gate_proj.weight": "model_1.safetensors", "model.layers.20.mlp.up_proj.weight": "model_1.safetensors", "model.layers.20.mlp.down_proj.weight": "model_1.safetensors", "model.layers.21.mixer.A_log": "model_1.safetensors", "model.layers.21.mixer.D": "model_1.safetensors", "model.layers.21.input_layernorm.weight": "model_1.safetensors", "model.layers.21.post_attention_layernorm.weight": "model_1.safetensors", "model.layers.21.mixer.dt_in_proj.weight": "model_1.safetensors", "model.layers.21.mixer.conv1d.weight": "model_1.safetensors", "model.layers.21.mixer.conv1d.bias": "model_1.safetensors", "model.layers.21.mixer.dt_proj.bias": "model_1.safetensors", "model.layers.21.mixer.in_proj.weight": "model_1.safetensors", "model.layers.21.mixer.dt_proj.weight": "model_1.safetensors", "model.layers.21.mixer.out_proj.weight": "model_1.safetensors", "model.layers.21.mlp.gate_proj.weight": "model_1.safetensors", "model.layers.21.mlp.up_proj.weight": "model_1.safetensors", "model.layers.21.mlp.down_proj.weight": "model_1.safetensors", "model.layers.22.mixer.A_log": "model_1.safetensors", "model.layers.22.mixer.D": "model_1.safetensors", "model.layers.22.input_layernorm.weight": "model_1.safetensors", "model.layers.22.post_attention_layernorm.weight": "model_1.safetensors", "model.layers.22.mixer.dt_in_proj.weight": "model_1.safetensors", "model.layers.22.mixer.conv1d.weight": "model_1.safetensors", "model.layers.22.mixer.conv1d.bias": "model_1.safetensors", "model.layers.22.mixer.dt_proj.bias": "model_1.safetensors", "model.layers.22.mixer.in_proj.weight": "model_1.safetensors", "model.layers.22.mixer.dt_proj.weight": "model_1.safetensors", "model.layers.22.mixer.out_proj.weight": "model_1.safetensors", "model.layers.22.mlp.gate_proj.weight": "model_1.safetensors", "model.layers.22.mlp.up_proj.weight": "model_1.safetensors", "model.layers.22.mlp.down_proj.weight": "model_1.safetensors", "model.layers.23.mixer.A_log": "model_1.safetensors", "model.layers.23.mixer.D": "model_1.safetensors", "model.layers.23.input_layernorm.weight": "model_1.safetensors", "model.layers.23.post_attention_layernorm.weight": "model_1.safetensors", "model.layers.23.mixer.dt_in_proj.weight": "model_1.safetensors", "model.layers.23.mixer.conv1d.weight": "model_1.safetensors", "model.layers.23.mixer.conv1d.bias": "model_1.safetensors", "model.layers.23.mixer.dt_proj.bias": "model_1.safetensors", "model.layers.23.mixer.in_proj.weight": "model_1.safetensors", "model.layers.23.mixer.dt_proj.weight": "model_1.safetensors", "model.layers.23.mixer.out_proj.weight": "model_1.safetensors", "model.layers.23.mlp.gate_proj.weight": "model_1.safetensors", "model.layers.23.mlp.up_proj.weight": "model_1.safetensors", "model.layers.23.mlp.down_proj.weight": "model_1.safetensors", "model.layers.24.mixer.A_log": "model_1.safetensors", "model.layers.24.mixer.D": "model_1.safetensors", "model.layers.24.input_layernorm.weight": "model_1.safetensors", "model.layers.24.post_attention_layernorm.weight": "model_1.safetensors", "model.layers.24.mixer.dt_in_proj.weight": "model_1.safetensors", "model.layers.24.mixer.conv1d.weight": "model_1.safetensors", "model.layers.24.mixer.conv1d.bias": "model_1.safetensors", "model.layers.24.mixer.dt_proj.bias": "model_1.safetensors", "model.layers.24.mixer.in_proj.weight": "model_1.safetensors", "model.layers.24.mixer.dt_proj.weight": "model_1.safetensors", "model.layers.24.mixer.out_proj.weight": "model_1.safetensors", "model.layers.24.mlp.gate_proj.weight": "model_1.safetensors", "model.layers.24.mlp.up_proj.weight": "model_1.safetensors", "model.layers.24.mlp.down_proj.weight": "model_1.safetensors", "model.layers.25.mixer.A_log": "model_1.safetensors", "model.layers.25.mixer.D": "model_1.safetensors", "model.layers.25.input_layernorm.weight": "model_1.safetensors", "model.layers.25.post_attention_layernorm.weight": "model_1.safetensors", "model.layers.25.mixer.dt_in_proj.weight": "model_1.safetensors", "model.layers.25.mixer.conv1d.weight": "model_1.safetensors", "model.layers.25.mixer.conv1d.bias": "model_1.safetensors", "model.layers.25.mixer.dt_proj.bias": "model_1.safetensors", "model.layers.25.mixer.in_proj.weight": "model_1.safetensors", "model.layers.25.mixer.dt_proj.weight": "model_1.safetensors", "model.layers.25.mixer.out_proj.weight": "model_1.safetensors", "model.layers.25.mlp.gate_proj.weight": "model_1.safetensors", "model.layers.25.mlp.up_proj.weight": "model_1.safetensors", "model.layers.25.mlp.down_proj.weight": "model_1.safetensors", "model.layers.26.input_layernorm.weight": "model_1.safetensors", "model.layers.26.post_attention_layernorm.weight": "model_1.safetensors", "model.layers.26.self_attn.q_proj.weight": "model_1.safetensors", "model.layers.26.self_attn.k_proj.weight": "model_1.safetensors", "model.layers.26.self_attn.v_proj.weight": "model_1.safetensors", "model.layers.26.self_attn.o_proj.weight": "model_1.safetensors", "model.layers.26.mlp.gate_proj.weight": "model_1.safetensors", "model.layers.26.mlp.up_proj.weight": "model_1.safetensors", "model.layers.26.mlp.down_proj.weight": "model_1.safetensors", "model.layers.27.mixer.A_log": "model_1.safetensors", "model.layers.27.mixer.D": "model_1.safetensors", "model.layers.27.input_layernorm.weight": "model_1.safetensors", "model.layers.27.post_attention_layernorm.weight": "model_1.safetensors", "model.layers.27.mixer.dt_in_proj.weight": "model_1.safetensors", "model.layers.27.mixer.conv1d.weight": "model_1.safetensors", "model.layers.27.mixer.conv1d.bias": "model_1.safetensors", "model.layers.27.mixer.dt_proj.bias": "model_1.safetensors", "model.layers.27.mixer.in_proj.weight": "model_1.safetensors", "model.layers.27.mixer.dt_proj.weight": "model_1.safetensors", "model.layers.27.mixer.out_proj.weight": "model_1.safetensors", "model.layers.27.mlp.gate_proj.weight": "model_1.safetensors", "model.layers.27.mlp.up_proj.weight": "model_1.safetensors", "model.layers.27.mlp.down_proj.weight": "model_1.safetensors", "model.layers.28.input_layernorm.weight": "model_1.safetensors", "model.layers.28.post_attention_layernorm.weight": "model_1.safetensors", "model.layers.28.self_attn.q_proj.weight": "model_1.safetensors", "model.layers.28.self_attn.k_proj.weight": "model_1.safetensors", "model.layers.28.self_attn.v_proj.weight": "model_1.safetensors", "model.layers.28.self_attn.o_proj.weight": "model_1.safetensors", "model.layers.28.mlp.gate_proj.weight": "model_1.safetensors", "model.layers.28.mlp.up_proj.weight": "model_1.safetensors", "model.layers.28.mlp.down_proj.weight": "model_2.safetensors", "model.layers.29.input_layernorm.weight": "model_2.safetensors", "model.layers.29.post_attention_layernorm.weight": "model_2.safetensors", "model.layers.29.self_attn.q_proj.weight": "model_2.safetensors", "model.layers.29.self_attn.k_proj.weight": "model_2.safetensors", "model.layers.29.self_attn.v_proj.weight": "model_2.safetensors", "model.layers.29.self_attn.o_proj.weight": "model_2.safetensors", "model.layers.29.mlp.gate_proj.weight": "model_2.safetensors", "model.layers.29.mlp.up_proj.weight": "model_2.safetensors", "model.layers.29.mlp.down_proj.weight": "model_2.safetensors", "model.layers.30.mixer.A_log": "model_2.safetensors", "model.layers.30.mixer.D": "model_2.safetensors", "model.layers.30.input_layernorm.weight": "model_2.safetensors", "model.layers.30.post_attention_layernorm.weight": "model_2.safetensors", "model.layers.30.mixer.dt_in_proj.weight": "model_2.safetensors", "model.layers.30.mixer.conv1d.weight": "model_2.safetensors", "model.layers.30.mixer.conv1d.bias": "model_2.safetensors", "model.layers.30.mixer.dt_proj.bias": "model_2.safetensors", "model.layers.30.mixer.in_proj.weight": "model_2.safetensors", "model.layers.30.mixer.dt_proj.weight": "model_2.safetensors", "model.layers.30.mixer.out_proj.weight": "model_2.safetensors", "model.layers.30.mlp.gate_proj.weight": "model_2.safetensors", "model.layers.30.mlp.up_proj.weight": "model_2.safetensors", "model.layers.30.mlp.down_proj.weight": "model_2.safetensors", "model.layers.31.mixer.A_log": "model_2.safetensors", "model.layers.31.mixer.D": "model_2.safetensors", "model.layers.31.input_layernorm.weight": "model_2.safetensors", "model.layers.31.post_attention_layernorm.weight": "model_2.safetensors", "model.layers.31.mixer.dt_in_proj.weight": "model_2.safetensors", "model.layers.31.mixer.conv1d.weight": "model_2.safetensors", "model.layers.31.mixer.conv1d.bias": "model_2.safetensors", "model.layers.31.mixer.dt_proj.bias": "model_2.safetensors", "model.layers.31.mixer.in_proj.weight": "model_2.safetensors", "model.layers.31.mixer.dt_proj.weight": "model_2.safetensors", "model.layers.31.mixer.out_proj.weight": "model_2.safetensors", "model.layers.31.mlp.gate_proj.weight": "model_2.safetensors", "model.layers.31.mlp.up_proj.weight": "model_2.safetensors", "model.layers.31.mlp.down_proj.weight": "model_2.safetensors", "model.layers.32.mixer.A_log": "model_2.safetensors", "model.layers.32.mixer.D": "model_2.safetensors", "model.layers.32.input_layernorm.weight": "model_2.safetensors", "model.layers.32.post_attention_layernorm.weight": "model_2.safetensors", "model.layers.32.mixer.dt_in_proj.weight": "model_2.safetensors", "model.layers.32.mixer.conv1d.weight": "model_2.safetensors", "model.layers.32.mixer.conv1d.bias": "model_2.safetensors", "model.layers.32.mixer.dt_proj.bias": "model_2.safetensors", "model.layers.32.mixer.in_proj.weight": "model_2.safetensors", "model.layers.32.mixer.dt_proj.weight": "model_2.safetensors", "model.layers.32.mixer.out_proj.weight": "model_2.safetensors", "model.layers.32.mlp.gate_proj.weight": "model_2.safetensors", "model.layers.32.mlp.up_proj.weight": "model_2.safetensors", "model.layers.32.mlp.down_proj.weight": "model_2.safetensors", "model.layers.33.mixer.A_log": "model_2.safetensors", "model.layers.33.mixer.D": "model_2.safetensors", "model.layers.33.input_layernorm.weight": "model_2.safetensors", "model.layers.33.post_attention_layernorm.weight": "model_2.safetensors", "model.layers.33.mixer.dt_in_proj.weight": "model_2.safetensors", "model.layers.33.mixer.conv1d.weight": "model_2.safetensors", "model.layers.33.mixer.conv1d.bias": "model_2.safetensors", "model.layers.33.mixer.dt_proj.bias": "model_2.safetensors", "model.layers.33.mixer.in_proj.weight": "model_2.safetensors", "model.layers.33.mixer.dt_proj.weight": "model_2.safetensors", "model.layers.33.mixer.out_proj.weight": "model_2.safetensors", "model.layers.33.mlp.gate_proj.weight": "model_2.safetensors", "model.layers.33.mlp.up_proj.weight": "model_2.safetensors", "model.layers.33.mlp.down_proj.weight": "model_2.safetensors", "model.layers.34.input_layernorm.weight": "model_2.safetensors", "model.layers.34.post_attention_layernorm.weight": "model_2.safetensors", "model.layers.34.self_attn.q_proj.weight": "model_2.safetensors", "model.layers.34.self_attn.k_proj.weight": "model_2.safetensors", "model.layers.34.self_attn.v_proj.weight": "model_2.safetensors", "model.layers.34.self_attn.o_proj.weight": "model_2.safetensors", "model.layers.34.mlp.gate_proj.weight": "model_2.safetensors", "model.layers.34.mlp.up_proj.weight": "model_2.safetensors", "model.layers.34.mlp.down_proj.weight": "model_2.safetensors", "model.layers.35.mixer.A_log": "model_2.safetensors", "model.layers.35.mixer.D": "model_2.safetensors", "model.layers.35.input_layernorm.weight": "model_2.safetensors", "model.layers.35.post_attention_layernorm.weight": "model_2.safetensors", "model.layers.35.mixer.dt_in_proj.weight": "model_2.safetensors", "model.layers.35.mixer.conv1d.weight": "model_2.safetensors", "model.layers.35.mixer.conv1d.bias": "model_2.safetensors", "model.layers.35.mixer.dt_proj.bias": "model_2.safetensors", "model.layers.35.mixer.in_proj.weight": "model_2.safetensors", "model.layers.35.mixer.dt_proj.weight": "model_2.safetensors", "model.layers.35.mixer.out_proj.weight": "model_2.safetensors", "model.layers.35.mlp.gate_proj.weight": "model_2.safetensors", "model.layers.35.mlp.up_proj.weight": "model_2.safetensors", "model.layers.35.mlp.down_proj.weight": "model_2.safetensors", "model.layers.36.mixer.A_log": "model_2.safetensors", "model.layers.36.mixer.D": "model_2.safetensors", "model.layers.36.input_layernorm.weight": "model_2.safetensors", "model.layers.36.post_attention_layernorm.weight": "model_2.safetensors", "model.layers.36.mixer.dt_in_proj.weight": "model_2.safetensors", "model.layers.36.mixer.conv1d.weight": "model_2.safetensors", "model.layers.36.mixer.conv1d.bias": "model_2.safetensors", "model.layers.36.mixer.dt_proj.bias": "model_2.safetensors", "model.layers.36.mixer.in_proj.weight": "model_2.safetensors", "model.layers.36.mixer.dt_proj.weight": "model_2.safetensors", "model.layers.36.mixer.out_proj.weight": "model_2.safetensors", "model.layers.36.mlp.gate_proj.weight": "model_2.safetensors", "model.layers.36.mlp.up_proj.weight": "model_2.safetensors", "model.layers.36.mlp.down_proj.weight": "model_2.safetensors", "model.layers.37.mixer.A_log": "model_2.safetensors", "model.layers.37.mixer.D": "model_2.safetensors", "model.layers.37.input_layernorm.weight": "model_2.safetensors", "model.layers.37.post_attention_layernorm.weight": "model_2.safetensors", "model.layers.37.mixer.dt_in_proj.weight": "model_2.safetensors", "model.layers.37.mixer.conv1d.weight": "model_2.safetensors", "model.layers.37.mixer.conv1d.bias": "model_2.safetensors", "model.layers.37.mixer.dt_proj.bias": "model_2.safetensors", "model.layers.37.mixer.in_proj.weight": "model_2.safetensors", "model.layers.37.mixer.dt_proj.weight": "model_2.safetensors", "model.layers.37.mixer.out_proj.weight": "model_2.safetensors", "model.layers.37.mlp.gate_proj.weight": "model_2.safetensors", "model.layers.37.mlp.up_proj.weight": "model_2.safetensors", "model.layers.37.mlp.down_proj.weight": "model_2.safetensors", "model.layers.38.mixer.A_log": "model_2.safetensors", "model.layers.38.mixer.D": "model_2.safetensors", "model.layers.38.input_layernorm.weight": "model_2.safetensors", "model.layers.38.post_attention_layernorm.weight": "model_2.safetensors", "model.layers.38.mixer.dt_in_proj.weight": "model_2.safetensors", "model.layers.38.mixer.conv1d.weight": "model_2.safetensors", "model.layers.38.mixer.conv1d.bias": "model_2.safetensors", "model.layers.38.mixer.dt_proj.bias": "model_2.safetensors", "model.layers.38.mixer.in_proj.weight": "model_2.safetensors", "model.layers.38.mixer.dt_proj.weight": "model_2.safetensors", "model.layers.38.mixer.out_proj.weight": "model_2.safetensors", "model.layers.38.mlp.gate_proj.weight": "model_2.safetensors", "model.layers.38.mlp.up_proj.weight": "model_2.safetensors", "model.layers.38.mlp.down_proj.weight": "model_2.safetensors", "model.layers.39.mixer.A_log": "model_2.safetensors", "model.layers.39.mixer.D": "model_2.safetensors", "model.layers.39.input_layernorm.weight": "model_2.safetensors", "model.layers.39.post_attention_layernorm.weight": "model_2.safetensors", "model.layers.39.mixer.dt_in_proj.weight": "model_2.safetensors", "model.layers.39.mixer.conv1d.weight": "model_2.safetensors", "model.layers.39.mixer.conv1d.bias": "model_2.safetensors", "model.layers.39.mixer.dt_proj.bias": "model_2.safetensors", "model.layers.39.mixer.in_proj.weight": "model_2.safetensors", "model.layers.39.mixer.dt_proj.weight": "model_2.safetensors", "model.layers.39.mixer.out_proj.weight": "model_2.safetensors", "model.layers.39.mlp.gate_proj.weight": "model_2.safetensors", "model.layers.39.mlp.up_proj.weight": "model_2.safetensors", "model.layers.39.mlp.down_proj.weight": "model_2.safetensors", "model.layers.40.mixer.A_log": "model_2.safetensors", "model.layers.40.mixer.D": "model_2.safetensors", "model.layers.40.input_layernorm.weight": "model_2.safetensors", "model.layers.40.post_attention_layernorm.weight": "model_2.safetensors", "model.layers.40.mixer.dt_in_proj.weight": "model_2.safetensors", "model.layers.40.mixer.conv1d.weight": "model_2.safetensors", "model.layers.40.mixer.conv1d.bias": "model_2.safetensors", "model.layers.40.mixer.dt_proj.bias": "model_2.safetensors", "model.layers.40.mixer.in_proj.weight": "model_2.safetensors", "model.layers.40.mixer.dt_proj.weight": "model_2.safetensors", "model.layers.40.mixer.out_proj.weight": "model_2.safetensors", "model.layers.40.mlp.gate_proj.weight": "model_2.safetensors", "model.layers.40.mlp.up_proj.weight": "model_2.safetensors", "model.layers.40.mlp.down_proj.weight": "model_2.safetensors", "model.layers.41.mixer.A_log": "model_2.safetensors", "model.layers.41.mixer.D": "model_2.safetensors", "model.layers.41.input_layernorm.weight": "model_2.safetensors", "model.layers.41.post_attention_layernorm.weight": "model_2.safetensors", "model.layers.41.mixer.dt_in_proj.weight": "model_2.safetensors", "model.layers.41.mixer.conv1d.weight": "model_2.safetensors", "model.layers.41.mixer.conv1d.bias": "model_2.safetensors", "model.layers.41.mixer.dt_proj.bias": "model_2.safetensors", "model.layers.41.mixer.in_proj.weight": "model_2.safetensors", "model.layers.41.mixer.dt_proj.weight": "model_2.safetensors", "model.layers.41.mixer.out_proj.weight": "model_2.safetensors", "model.layers.41.mlp.gate_proj.weight": "model_2.safetensors", "model.layers.41.mlp.up_proj.weight": "model_2.safetensors", "model.layers.41.mlp.down_proj.weight": "model_2.safetensors", "model.layers.42.mixer.A_log": "model_2.safetensors", "model.layers.42.mixer.D": "model_2.safetensors", "model.layers.42.input_layernorm.weight": "model_2.safetensors", "model.layers.42.post_attention_layernorm.weight": "model_2.safetensors", "model.layers.42.mixer.dt_in_proj.weight": "model_2.safetensors", "model.layers.42.mixer.conv1d.weight": "model_2.safetensors", "model.layers.42.mixer.conv1d.bias": "model_2.safetensors", "model.layers.42.mixer.dt_proj.bias": "model_2.safetensors", "model.layers.42.mixer.in_proj.weight": "model_2.safetensors", "model.layers.42.mixer.dt_proj.weight": "model_2.safetensors", "model.layers.42.mixer.out_proj.weight": "model_2.safetensors", "model.layers.42.mlp.gate_proj.weight": "model_2.safetensors", "model.layers.42.mlp.up_proj.weight": "model_2.safetensors", "model.layers.42.mlp.down_proj.weight": "model_2.safetensors", "model.layers.43.mixer.A_log": "model_2.safetensors", "model.layers.43.mixer.D": "model_2.safetensors", "model.layers.43.input_layernorm.weight": "model_2.safetensors", "model.layers.43.post_attention_layernorm.weight": "model_2.safetensors", "model.layers.43.mixer.dt_in_proj.weight": "model_2.safetensors", "model.layers.43.mixer.conv1d.weight": "model_2.safetensors", "model.layers.43.mixer.conv1d.bias": "model_2.safetensors", "model.layers.43.mixer.dt_proj.bias": "model_2.safetensors", "model.layers.43.mixer.in_proj.weight": "model_2.safetensors", "model.layers.43.mixer.dt_proj.weight": "model_2.safetensors", "model.layers.43.mixer.out_proj.weight": "model_2.safetensors", "model.layers.43.mlp.gate_proj.weight": "model_2.safetensors", "model.layers.43.mlp.up_proj.weight": "model_3.safetensors", "model.layers.43.mlp.down_proj.weight": "model_3.safetensors", "model.layers.44.mixer.A_log": "model_3.safetensors", "model.layers.44.mixer.D": "model_3.safetensors", "model.layers.44.input_layernorm.weight": "model_3.safetensors", "model.layers.44.post_attention_layernorm.weight": "model_3.safetensors", "model.layers.44.mixer.dt_in_proj.weight": "model_3.safetensors", "model.layers.44.mixer.conv1d.weight": "model_3.safetensors", "model.layers.44.mixer.conv1d.bias": "model_3.safetensors", "model.layers.44.mixer.dt_proj.bias": "model_3.safetensors", "model.layers.44.mixer.in_proj.weight": "model_3.safetensors", "model.layers.44.mixer.dt_proj.weight": "model_3.safetensors", "model.layers.44.mixer.out_proj.weight": "model_3.safetensors", "model.layers.44.mlp.gate_proj.weight": "model_3.safetensors", "model.layers.44.mlp.up_proj.weight": "model_3.safetensors", "model.layers.44.mlp.down_proj.weight": "model_3.safetensors", "model.layers.45.mixer.A_log": "model_3.safetensors", "model.layers.45.mixer.D": "model_3.safetensors", "model.layers.45.input_layernorm.weight": "model_3.safetensors", "model.layers.45.post_attention_layernorm.weight": "model_3.safetensors", "model.layers.45.mixer.dt_in_proj.weight": "model_3.safetensors", "model.layers.45.mixer.conv1d.weight": "model_3.safetensors", "model.layers.45.mixer.conv1d.bias": "model_3.safetensors", "model.layers.45.mixer.dt_proj.bias": "model_3.safetensors", "model.layers.45.mixer.in_proj.weight": "model_3.safetensors", "model.layers.45.mixer.dt_proj.weight": "model_3.safetensors", "model.layers.45.mixer.out_proj.weight": "model_3.safetensors", "model.layers.45.mlp.gate_proj.weight": "model_3.safetensors", "model.layers.45.mlp.up_proj.weight": "model_3.safetensors", "model.layers.45.mlp.down_proj.weight": "model_3.safetensors", "model.layers.46.mixer.A_log": "model_3.safetensors", "model.layers.46.mixer.D": "model_3.safetensors", "model.layers.46.input_layernorm.weight": "model_3.safetensors", "model.layers.46.post_attention_layernorm.weight": "model_3.safetensors", "model.layers.46.mixer.dt_in_proj.weight": "model_3.safetensors", "model.layers.46.mixer.conv1d.weight": "model_3.safetensors", "model.layers.46.mixer.conv1d.bias": "model_3.safetensors", "model.layers.46.mixer.dt_proj.bias": "model_3.safetensors", "model.layers.46.mixer.in_proj.weight": "model_3.safetensors", "model.layers.46.mixer.dt_proj.weight": "model_3.safetensors", "model.layers.46.mixer.out_proj.weight": "model_3.safetensors", "model.layers.46.mlp.gate_proj.weight": "model_3.safetensors", "model.layers.46.mlp.up_proj.weight": "model_3.safetensors", "model.layers.46.mlp.down_proj.weight": "model_3.safetensors", "model.layers.47.mixer.A_log": "model_3.safetensors", "model.layers.47.mixer.D": "model_3.safetensors", "model.layers.47.input_layernorm.weight": "model_3.safetensors", "model.layers.47.post_attention_layernorm.weight": "model_3.safetensors", "model.layers.47.mixer.dt_in_proj.weight": "model_3.safetensors", "model.layers.47.mixer.conv1d.weight": "model_3.safetensors", "model.layers.47.mixer.conv1d.bias": "model_3.safetensors", "model.layers.47.mixer.dt_proj.bias": "model_3.safetensors", "model.layers.47.mixer.in_proj.weight": "model_3.safetensors", "model.layers.47.mixer.dt_proj.weight": "model_3.safetensors", "model.layers.47.mixer.out_proj.weight": "model_3.safetensors", "model.layers.47.mlp.gate_proj.weight": "model_3.safetensors", "model.layers.47.mlp.up_proj.weight": "model_3.safetensors", "model.layers.47.mlp.down_proj.weight": "model_3.safetensors", "model.layers.48.input_layernorm.weight": "model_3.safetensors", "model.layers.48.post_attention_layernorm.weight": "model_3.safetensors", "model.layers.48.self_attn.q_proj.weight": "model_3.safetensors", "model.layers.48.self_attn.k_proj.weight": "model_3.safetensors", "model.layers.48.self_attn.v_proj.weight": "model_3.safetensors", "model.layers.48.self_attn.o_proj.weight": "model_3.safetensors", "model.layers.48.mlp.gate_proj.weight": "model_3.safetensors", "model.layers.48.mlp.up_proj.weight": "model_3.safetensors", "model.layers.48.mlp.down_proj.weight": "model_3.safetensors", "model.layers.49.mixer.A_log": "model_3.safetensors", "model.layers.49.mixer.D": "model_3.safetensors", "model.layers.49.input_layernorm.weight": "model_3.safetensors", "model.layers.49.post_attention_layernorm.weight": "model_3.safetensors", "model.layers.49.mixer.dt_in_proj.weight": "model_3.safetensors", "model.layers.49.mixer.conv1d.weight": "model_3.safetensors", "model.layers.49.mixer.conv1d.bias": "model_3.safetensors", "model.layers.49.mixer.dt_proj.bias": "model_3.safetensors", "model.layers.49.mixer.in_proj.weight": "model_3.safetensors", "model.layers.49.mixer.dt_proj.weight": "model_3.safetensors", "model.layers.49.mixer.out_proj.weight": "model_3.safetensors", "model.layers.49.mlp.gate_proj.weight": "model_3.safetensors", "model.layers.49.mlp.up_proj.weight": "model_3.safetensors", "model.layers.49.mlp.down_proj.weight": "model_3.safetensors", "model.norm.weight": "model_3.safetensors", "lm_head.weight": "model_3.safetensors" } }