caca-250M-untrained / config.json
Initial commit: caca-250M untrained model
21afc98 verified
{
"architectures": [
"CacaForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"attention_pattern": "all_global",
"attention_sink_size": 4,
"attention_sink_window": 1024,
"attn_logit_softcapping": null,
"audio_config": {},
"auto_map": {
"AutoConfig": "caca_transformers.CacaConfig",
"AutoModel": "caca_transformers.CacaModel",
"AutoModelForCausalLM": "caca_transformers.CacaForCausalLM"
},
"bos_token_id": 1,
"chat_template": "{% for message in messages %}{% if message['role'] == 'system' %}System: {{ message['content'] }}\n{% elif message['role'] == 'user' %}User: {{ message['content'] }}\n{% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }}\n{% endif %}{% endfor %}{% if add_generation_prompt %}Assistant:{% endif %}",
"cross_attention_frequency": 4,
"dtype": "float32",
"eos_token_id": 2,
"expert_capacity_factor": 1.0,
"expert_choice_k": 0.125,
"final_logit_softcapping": null,
"global_attention_every_n_layers": 2,
"head_dim": 64,
"hidden_dropout": 0.1,
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_scale_init": 1e-05,
"longformer_attention_window": 512,
"max_position_embeddings": 8192,
"merge_threshold": 0.5,
"mlp_bias": false,
"mod_capacity_factor": 0.5,
"mod_route_method": "learned",
"model_type": "caca",
"moe_layer_frequency": 2,
"num_attention_heads": 16,
"num_expert_groups": 1,
"num_experts": 8,
"num_experts_per_tok": 2,
"num_hidden_layers": 24,
"num_key_value_heads": 4,
"pipeline_parallel_size": 1,
"pretraining_tp": 1,
"projector_hidden_size": 1024,
"qk_norm_eps": 1e-06,
"residual_dropout": 0.1,
"rms_norm_eps": 1e-06,
"rope_scaling": null,
"rope_theta": 10000.0,
"router_aux_loss_coef": 0.01,
"router_z_loss_coef": 0.001,
"sliding_window": null,
"stochastic_depth_prob": 0.1,
"tensor_parallel_size": 1,
"tie_word_embeddings": false,
"transformers_version": "4.57.3",
"use_alibi": false,
"use_attention_sink": false,
"use_cache": true,
"use_cross_attention": false,
"use_expert_choice": false,
"use_flash_attn": true,
"use_grouped_moe": false,
"use_grouped_query_attention": false,
"use_layer_scale": false,
"use_longformer_attention": false,
"use_mixture_of_depths": false,
"use_moe": false,
"use_multi_query_attention": false,
"use_multimodal": false,
"use_qk_norm": true,
"use_rotary_embeddings": true,
"use_soft_merging": false,
"use_stochastic_depth": false,
"vision_config": {},
"vocab_size": 32000
}
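
For reference, a minimal loading sketch (not part of the repo): the "auto_map" entries route AutoConfig / AutoModelForCausalLM to the custom classes in caca_transformers.py, so loading needs trust_remote_code=True. The repo id "Lyon28/caca-250M-untrained" is inferred from the page header and may differ; this assumes the repo also ships a tokenizer.

from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

repo_id = "Lyon28/caca-250M-untrained"  # assumed repo id, adjust if needed

# auto_map dispatches to caca_transformers.CacaConfig / CacaForCausalLM,
# which are not part of the transformers library, hence trust_remote_code.
config = AutoConfig.from_pretrained(repo_id, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_id, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(repo_id)

# The embedded chat_template renders System: / User: / Assistant: turns and,
# with add_generation_prompt=True, appends a trailing "Assistant:" cue.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
prompt = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=False
)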