{ "architectures": [ "CacaForCausalLM" ], "attention_bias": false, "attention_dropout": 0.0, "attention_pattern": "all_global", "attention_sink_size": 4, "attention_sink_window": 1024, "attn_logit_softcapping": null, "audio_config": {}, "auto_map": { "AutoConfig": "caca_transformers.CacaConfig", "AutoModel": "caca_transformers.CacaModel", "AutoModelForCausalLM": "caca_transformers.CacaForCausalLM" }, "bos_token_id": 1, "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}System: {{ message['content'] }}\n{% elif message['role'] == 'user' %}User: {{ message['content'] }}\n{% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }}\n{% endif %}{% endfor %}{% if add_generation_prompt %}Assistant:{% endif %}", "cross_attention_frequency": 4, "dtype": "float32", "eos_token_id": 2, "expert_capacity_factor": 1.0, "expert_choice_k": 0.125, "final_logit_softcapping": null, "global_attention_every_n_layers": 2, "head_dim": 64, "hidden_dropout": 0.1, "hidden_size": 256, "initializer_range": 0.02, "intermediate_size": 1024, "layer_scale_init": 1e-05, "longformer_attention_window": 512, "max_position_embeddings": 2048, "merge_threshold": 0.5, "mlp_bias": false, "mod_capacity_factor": 0.5, "mod_route_method": "learned", "model_type": "caca", "moe_layer_frequency": 2, "num_attention_heads": 4, "num_expert_groups": 1, "num_experts": 8, "num_experts_per_tok": 2, "num_hidden_layers": 8, "num_key_value_heads": 2, "pipeline_parallel_size": 1, "pretraining_tp": 1, "projector_hidden_size": 256, "qk_norm_eps": 1e-06, "residual_dropout": 0.1, "rms_norm_eps": 1e-06, "rope_scaling": null, "rope_theta": 10000.0, "router_aux_loss_coef": 0.01, "router_z_loss_coef": 0.001, "sliding_window": null, "stochastic_depth_prob": 0.1, "tensor_parallel_size": 1, "tie_word_embeddings": false, "transformers_version": "4.57.3", "use_alibi": false, "use_attention_sink": false, "use_cache": true, "use_cross_attention": false, "use_expert_choice": false, "use_flash_attn": true, "use_grouped_moe": false, "use_grouped_query_attention": false, "use_layer_scale": false, "use_longformer_attention": false, "use_mixture_of_depths": false, "use_moe": false, "use_multi_query_attention": false, "use_multimodal": false, "use_qk_norm": true, "use_rotary_embeddings": true, "use_soft_merging": false, "use_stochastic_depth": false, "vision_config": {}, "vocab_size": 32000 }