{
  "architectures": [
    "CacaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attention_pattern": "all_global",
  "attention_sink_size": 4,
  "attention_sink_window": 1024,
  "attn_logit_softcapping": null,
  "audio_config": {},
  "auto_map": {
    "AutoConfig": "caca_transformers.CacaConfig",
    "AutoModel": "caca_transformers.CacaModel",
    "AutoModelForCausalLM": "caca_transformers.CacaForCausalLM"
  },
  "bos_token_id": 1,
  "chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}System: {{ message['content'] }}\n{% elif message['role'] == 'user' %}User: {{ message['content'] }}\n{% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }}\n{% endif %}{% endfor %}{% if add_generation_prompt %}Assistant:{% endif %}",
  "cross_attention_frequency": 4,
  "dtype": "float32",
  "eos_token_id": 2,
  "expert_capacity_factor": 1.0,
  "expert_choice_k": 0.125,
  "final_logit_softcapping": null,
  "global_attention_every_n_layers": 2,
  "head_dim": 64,
  "hidden_dropout": 0.1,
  "hidden_size": 640,
  "initializer_range": 0.02,
  "intermediate_size": 2560,
  "layer_scale_init": 1e-05,
  "longformer_attention_window": 512,
  "max_position_embeddings": 4096,
  "merge_threshold": 0.5,
  "mlp_bias": false,
  "mod_capacity_factor": 0.5,
  "mod_route_method": "learned",
  "model_type": "caca",
  "moe_layer_frequency": 2,
  "num_attention_heads": 10,
  "num_expert_groups": 1,
  "num_experts": 8,
  "num_experts_per_tok": 2,
  "num_hidden_layers": 16,
  "num_key_value_heads": 2,
  "pipeline_parallel_size": 1,
  "pretraining_tp": 1,
  "projector_hidden_size": 640,
  "qk_norm_eps": 1e-06,
  "residual_dropout": 0.1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "router_aux_loss_coef": 0.01,
  "router_z_loss_coef": 0.001,
  "sliding_window": null,
  "stochastic_depth_prob": 0.1,
  "tensor_parallel_size": 1,
  "tie_word_embeddings": false,
  "transformers_version": "4.57.3",
  "use_alibi": false,
  "use_attention_sink": false,
  "use_cache": true,
  "use_cross_attention": false,
  "use_expert_choice": false,
  "use_flash_attn": true,
  "use_grouped_moe": false,
  "use_grouped_query_attention": false,
  "use_layer_scale": false,
  "use_longformer_attention": false,
  "use_mixture_of_depths": false,
  "use_moe": false,
  "use_multi_query_attention": false,
  "use_multimodal": false,
  "use_qk_norm": true,
  "use_rotary_embeddings": true,
  "use_soft_merging": false,
  "use_stochastic_depth": false,
  "vision_config": {},
  "vocab_size": 32000
}
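A minimal loading sketch, assuming this config ships in a directory or Hub repo that also contains the `caca_transformers.py` module referenced by `auto_map`. The path `"path/to/caca-model"` is a placeholder, not a real repository id, and the chat-template call assumes the same template in `chat_template` above is also exposed to the tokenizer.

```python
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

model_dir = "path/to/caca-model"  # hypothetical location of this config

# trust_remote_code=True lets transformers resolve the "auto_map" entries
# to the custom CacaConfig / CacaForCausalLM classes in caca_transformers.py.
config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(model_dir, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# The template renders "System:/User:/Assistant:" turns and, with
# add_generation_prompt=True, appends a trailing "Assistant:" prompt.
# (Assumes the chat template is also available via the tokenizer config.)
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
print(prompt)
```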