Built with Axolotl

See axolotl config (axolotl version: 0.9.2):

base_model: giux78/zagreus-test-202000
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

#tokenizer_type: auto
strict: false
seed: 1337
output_dir: ./ale_outputs/zagreus-350M-sft

# === Datasets ===
streaming: false
#datasets:
#  - path: /leonardo_work/EUHPC_A04_045/training/sft_data #/leonardo_work/EUHPC_A04_045/training/test_data #/leonardo_work/EUHPC_A04_045/.data
#    type: chat_template
#    chat_template: tokenizer_default_fallback_chatml
#    field_messages: conversations
#    message_property_mappings:
#      role: from
#      content: value
#    roles:
#      user: ["human","user"]
#      assistant: ["gpt","assistant"]
#      system: ["system"]
#      tool: ["tool"]
#   roles_to_train: ["assistant"]
#   train_on_eos: "turn"

default_system_message: "Sei un assistente utile."
chat_template_jinja: |
        {% set has_system = messages and messages[0]['role'] == 'system' %}
        {% set system_text = (messages[0]['content'] if has_system else (default_system_message or "")) | trim %}

        {{ bos_token }}
        {% if system_text %}
                {{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_text + '<|eot_id|>' }}
        {% endif %}

        {% set loop_messages = messages[1:] if has_system else messages %}
        {% for message in loop_messages %}
                {% set role = message['role'] %}
                {% set content = (message['content'] | trim) %}
                {% if role in ['user','assistant','tool'] and content %}
                        {{ '<|start_header_id|>' + role + '<|end_header_id|>\n\n' + content + '<|eot_id|>' }}
                {% endif %}
        {% endfor %}

        {# If the last message is NOT from the assistant, open an assistant header for generation #}
        {% if loop_messages|length == 0 or loop_messages[-1]['role'] != 'assistant' %}
                {{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
        {% endif %}

        {{ eos_token }}
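# For reference, a rough sketch (Jinja whitespace handling aside) of what the template
# above renders for a system + user + assistant exchange, using the special tokens
# defined at the bottom of this config:
#   <|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nSei un assistente utile.<|eot_id|>
#   <|start_header_id|>user<|end_header_id|>\n\n<user text><|eot_id|>
#   <|start_header_id|>assistant<|end_header_id|>\n\n<assistant text><|eot_id|>
#   <|end_of_text|>
# When the last message is not from the assistant, an open assistant header is
# emitted instead, so the model continues from there at generation time.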

datasets:
  - path:  /leonardo_work/EUHPC_A04_045/.data #/leonardo_work/EUHPC_A04_045/training/test_data_cleaned /leonardo_work/EUHPC_A04_045/training/sft_data
    # chat_template builds training prompts from the conversation turns
    type: chat_template
    # Map the dataset's from/value keys onto role/content
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value
    roles:
      user: ["human", "user"]
      assistant: ["gpt", "assistant"]
      system: ["system"]
      tool: ["tool"]
    roles_to_train: ["assistant"]
    train_on_eos: "turn" 
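# A minimal sketch of one record in the expected conversations format (field names
# per the mapping above; the actual files under the path are not shown here):
#   {"conversations": [
#     {"from": "system", "value": "..."},
#     {"from": "human",  "value": "..."},
#     {"from": "gpt",    "value": "..."}
#   ]}
# Only assistant ("gpt") turns contribute to the loss (roles_to_train), and the
# EOS token is trained at the end of each assistant turn (train_on_eos: "turn").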

# === Sequencing / packing ===
sequence_len: 4096
sample_packing: true
remove_unused_columns: false   # <-- add this line
eval_sample_packing: false
pad_to_sequence_len: true
streaming_multipack_buffer_size: 10000
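# With sample_packing enabled, several short examples are packed into each
# 4096-token sequence instead of padding each one individually, which raises
# token throughput; evaluation batches stay unpacked (eval_sample_packing: false).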

# === Optimization ===
#optimizer: adamw_torch_fused
#learning_rate: 2e-5
#lr_scheduler: cosine
#warmup_ratio: 0.1
#weight_decay: 0.0

optimizer: adamw_torch_fused
lr_scheduler: cosine
#warmup_ratio: 0.01
#weight_decay: 0.10
learning_rate: 1e-5          # KEY VALUE: reduced from 5e-4 to a safe value for SFT.
weight_decay: 0.01           # Lighter regularization, standard for SFT.
#warmup_ratio: 0.03           # Longer warmup for more stability early in training.
warmup_steps: 100
adam_beta1: 0.9
adam_beta2: 0.95
adam_epsilon: 1e-8
max_grad_norm: 1.0


# === Batch (per GPU) ===
micro_batch_size: 1
gradient_accumulation_steps: 8
# Eff. batch = micro_batch_size * grad_accum * num_gpus = 1 * 8 * 32 = 256

# === Precisione / memoria ===
bf16: auto
tf32: true
flash_attention: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false

# === FSDP (Axolotl uses fsdp_config; the "fsdp:" key is deprecated) ===
fsdp_config:
  fsdp_sharding_strategy: FULL_SHARD           # shards params, grads, and optimizer state
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
  fsdp_use_orig_params: false
  fsdp_sync_module_states: true
  fsdp_limit_all_gathers: true
  fsdp_cpu_ram_efficient_loading: true
  fsdp_offload_params: false                   # enable only if VRAM is tight (see variant)
  fsdp_state_dict_type: SHARDED_STATE_DICT     # lighter checkpoints on multi-node clusters
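# A typical launch for this config (a sketch; the exact CLI entry point and the
# config filename depend on the installed axolotl version and your setup):
#   accelerate launch -m axolotl.cli.train config.yaml
# Axolotl picks up the fsdp_config above from this file.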

# === Loop di training ===
num_epochs: 1                                  # with 170GB of data, one pass is enough
# max_steps: 200000                            # alternative: set a step/token budget

# === Eval / checkpoint ===
val_set_size: 0.003
evals_per_epoch: 8
save_steps: 1000          # save every 1,000 steps (set whatever value you prefer)
save_total_limit: 5
logging_steps: 20


# === Tracciamento ===
wandb_mode: "offline"
wandb_project: zagreus-350M-sft
wandb_entity: mii-llm
wandb_name: sft

# === Token speciali ===
special_tokens:
  bos_token: <|begin_of_text|>
  pad_token: <|end_of_text|>
  eos_token: <|end_of_text|>
  #eos_token: <|eot_id|>
  unk_token: <unk>
 

ale_outputs/zagreus-350M-sft

This model is a fine-tuned version of giux78/zagreus-test-202000, trained with the SFT configuration above. It achieves the following results on the evaluation set:

  • Loss: 2.2755

Model description

More information needed

Intended uses & limitations

More information needed
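
As a minimal usage sketch (not an official example; the repo id and generation settings below are assumptions), the fine-tuned model can be loaded and prompted through its chat template:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "giux78/zagreus-test-202000-sft-6"  # assumed repo id for the published weights

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

messages = [
    {"role": "system", "content": "Sei un assistente utile."},
    {"role": "user", "content": "Chi sei?"},
]

# The trained chat template opens an assistant header when the last turn is not
# from the assistant, so the model continues from there.
input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt")
output_ids = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))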

Training and evaluation data

More information needed

Training procedure

Training hyperparameters

The following hyperparameters were used during training:

  • learning_rate: 1e-05
  • train_batch_size: 1
  • eval_batch_size: 1
  • seed: 1337
  • distributed_type: multi-GPU
  • num_devices: 32
  • gradient_accumulation_steps: 8
  • total_train_batch_size: 256
  • total_eval_batch_size: 32
  • optimizer: AdamW (adamw_torch_fused) with betas=(0.9, 0.95) and epsilon=1e-08; no additional optimizer arguments
  • lr_scheduler_type: cosine
  • lr_scheduler_warmup_steps: 100
  • num_epochs: 1.0
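
For reference, a sketch of how the optimizer and schedule settings above map to standard PyTorch/transformers objects (Axolotl wires this up internally; the total step count here is an illustrative assumption):

import torch
from transformers import get_cosine_schedule_with_warmup

model = torch.nn.Linear(8, 8)  # stand-in for the fine-tuned model
num_training_steps = 3465      # assumption: roughly one epoch at effective batch size 256

# adamw_torch_fused corresponds to torch.optim.AdamW (fused variant on CUDA)
optimizer = torch.optim.AdamW(
    model.parameters(), lr=1e-5, betas=(0.9, 0.95), eps=1e-8, weight_decay=0.01
)
scheduler = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=num_training_steps
)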

Training results

Training Loss   Epoch    Step   Validation Loss
No log          0.0003      1   2.2805
3.9252          0.1250    433   2.2768
3.8908          0.2501    866   2.2755
3.8096          0.3751   1299   2.2756
3.7972          0.5001   1732   2.2758
3.8346          0.6252   2165   2.2756
3.8084          0.7502   2598   2.2756
3.8643          0.8753   3031   2.2755

Framework versions

  • Transformers 4.56.2
  • Pytorch 2.5.1+cu121
  • Datasets 3.5.1
  • Tokenizers 0.22.1