{ "architectures": [ "HymbaForCausalLM" ], "attention_dropout": 0.0, "attn_factor": 0.5, "attn_hidden_size": 1024, "attn_implementation": "flash_attention_2", "attn_implementation_new": "flash_attention_2", "attn_only_wo_proj": true, "auto_map": { "AutoConfig": "configuration_hymba.HymbaConfig", "AutoModelForCausalLM": "modeling_hymba.HymbaForCausalLM" }, "bos_token_id": 128000, "calc_logits_for_entire_prompt": false, "conv_dim": { "0": 256, "1": 256, "10": 256, "11": 256, "12": 256, "13": 256, "14": 256, "15": 256, "16": 3200, "17": 3200, "18": 3200, "19": 3200, "2": 256, "20": 3200, "21": 3200, "22": 3200, "23": 3200, "24": 3200, "25": 3200, "26": 3200, "27": 3200, "28": 3200, "29": 3200, "3": 256, "30": 3200, "31": 3200, "4": 256, "5": 256, "6": 256, "7": 256, "8": 256, "9": 256 }, "eos_token_id": 128001, "global_attn_idx": [ 0, 8, 15 ], "head_dim": 64, "hidden_act": "silu", "hidden_size": 2048, "initializer_range": 0.02, "intermediate_size": 8192, "kq_head_dim": -1, "kq_norm": "none", "kv_reuse_every_i_layer": -1, "kv_reuse_group": null, "kv_weight_reuse": false, "layer_type": [ "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h", "h" ], "mamba_conv_bias": true, "mamba_d_conv": 4, "mamba_d_state": 16, "mamba_dt_rank": 100, "mamba_expand": 1, "mamba_inner_layernorms": false, "mamba_proj_bias": false, "max_position_embeddings": 131072, "memory_tokens_interspersed_every": 0, "mlp_hidden_act": "silu", "model_type": "hymba", "num_attention_heads": 16, "num_experts": 1, "num_experts_per_tok": 1, "num_hidden_layers": 16, "num_key_value_heads": 4, "num_mamba": 1, "num_memory_tokens": 0, "orig_max_position_embeddings": 2048, "output_router_logits": false, "pad_token_id": null, "pure_attn": false, "repeat_x_before_conv": true, "rms_norm_eps": 1e-05, "rope": true, "rope_scaling": { "factor": 32.0, "high_freq_factor": 4.0, "low_freq_factor": 1.0, "original_max_position_embeddings": 8192, "rope_type": "llama3" }, "rope_theta": 500000.0, "rope_type": "ntk", "router_aux_loss_coef": 0.001, "seq_length": 8192, "sliding_window": 1024, "tie_word_embeddings": true, "torch_dtype": "bfloat16", "transformers_version": "4.53.0", "use_cache": false, "use_mamba_kernels": true, "v_head_dim": 64, "vocab_size": 128256 }