Built with Axolotl

See axolotl config (axolotl version: 0.9.2):

base_model: giux78/zagreus-test-202000
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

#tokenizer_type: auto
strict: false
seed: 1337
output_dir: ./ale_outputs/zagreus-350M-sft

# === Datasets ===
streaming: false
#datasets:
#  - path: /leonardo_work/EUHPC_A04_045/training/sft_data #/leonardo_work/EUHPC_A04_045/training/test_data #/leonardo_work/EUHPC_A04_045/.data
#    type: chat_template
#    chat_template: tokenizer_default_fallback_chatml
#    field_messages: conversations
#    message_property_mappings:
#      role: from
#      content: value
#    roles:
#      user: ["human","user"]
#      assistant: ["gpt","assistant"]
#      system: ["system"]
#      tool: ["tool"]
#   roles_to_train: ["assistant"]
#   train_on_eos: "turn"

default_system_message: "Sei un assistente utile."
chat_template_jinja: |
        {% set has_system = messages and messages[0]['role'] == 'system' %}
        {% set system_text = (messages[0]['content'] if has_system else (default_system_message or "")) | trim %}

        {{ bos_token }}
        {% if system_text %}
                {{ '<|start_header_id|>system<|end_header_id|>\n\n' + system_text + '<|eot_id|>' }}
        {% endif %}

        {% set loop_messages = messages[1:] if has_system else messages %}
        {% for message in loop_messages %}
                {% set role = message['role'] %}
                {% set content = (message['content'] | trim) %}
                {% if role in ['user','assistant','tool'] and content %}
                        {{ '<|start_header_id|>' + role + '<|end_header_id|>\n\n' + content + '<|eot_id|>' }}
                {% endif %}
        {% endfor %}

        {# If the last message is NOT from the assistant, open an assistant header for generation #}
        {% if loop_messages|length == 0 or loop_messages[-1]['role'] != 'assistant' %}
                {{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
        {% endif %}

        {{ eos_token }}
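# For reference, a rough sketch (Jinja whitespace handling aside) of what the template
# above renders for a system + user + assistant exchange, using the special tokens
# defined at the bottom of this config:
#   <|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nSei un assistente utile.<|eot_id|>
#   <|start_header_id|>user<|end_header_id|>\n\n<user text><|eot_id|>
#   <|start_header_id|>assistant<|end_header_id|>\n\n<assistant text><|eot_id|>
#   <|end_of_text|>
# When the last message is not from the assistant, an open assistant header is
# emitted instead, so the model continues from there at generation time.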

datasets:
  - path:  /leonardo_work/EUHPC_A04_045/.data #/leonardo_work/EUHPC_A04_045/training/test_data_cleaned /leonardo_work/EUHPC_A04_045/training/sft_data
    # chat_template builds training prompts from the conversation turns
    type: chat_template
    # Map the dataset's from/value keys onto role/content
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value
    roles:
      user: ["human", "user"]
      assistant: ["gpt", "assistant"]
      system: ["system"]
      tool: ["tool"]
    roles_to_train: ["assistant"]
    train_on_eos: "turn" 
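# A minimal sketch of one record in the expected conversations format (field names
# per the mapping above; the actual files under the path are not shown here):
#   {"conversations": [
#     {"from": "system", "value": "..."},
#     {"from": "human",  "value": "..."},
#     {"from": "gpt",    "value": "..."}
#   ]}
# Only assistant ("gpt") turns contribute to the loss (roles_to_train), and the
# EOS token is trained at the end of each assistant turn (train_on_eos: "turn").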

# === Sequencing / packing ===
sequence_len: 4096
sample_packing: true
remove_unused_columns: false   # <-- add this line
eval_sample_packing: false
pad_to_sequence_len: true
streaming_multipack_buffer_size: 10000
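# With sample_packing enabled, several short examples are packed into each
# 4096-token sequence instead of padding each one individually, which raises
# token throughput; evaluation batches stay unpacked (eval_sample_packing: false).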

# === Optimization ===
#optimizer: adamw_torch_fused
#learning_rate: 2e-5
#lr_scheduler: cosine
#warmup_ratio: 0.1
#weight_decay: 0.0

optimizer: adamw_torch_fused
lr_scheduler: cosine
#warmup_ratio: 0.01
#weight_decay: 0.10
learning_rate: 1e-5          # KEY VALUE: reduced from 5e-4 to a safe value for SFT.
weight_decay: 0.01           # Lighter regularization, standard for SFT.
#warmup_ratio: 0.03           # Longer warmup for more stability early in training.
warmup_steps: 100
adam_beta1: 0.9
adam_beta2: 0.95
adam_epsilon: 1e-8
max_grad_norm: 1.0


# === Batch (per GPU) ===
micro_batch_size: 1
gradient_accumulation_steps: 8
# Eff. batch = micro_batch_size * grad_accum * num_gpus = 1 * 8 * 32 = 256

# === Precisione / memoria ===
bf16: auto
tf32: true
flash_attention: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false

# === FSDP (Axolotl uses fsdp_config; the "fsdp:" key is deprecated) ===
fsdp_config:
  fsdp_sharding_strategy: FULL_SHARD           # shards params, grads, and optimizer state
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
  fsdp_use_orig_params: false
  fsdp_sync_module_states: true
  fsdp_limit_all_gathers: true
  fsdp_cpu_ram_efficient_loading: true
  fsdp_offload_params: false                   # enable only if VRAM is tight (see variant)
  fsdp_state_dict_type: SHARDED_STATE_DICT     # lighter checkpoints on multi-node clusters
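# A typical launch for this config (a sketch; the exact CLI entry point and the
# config filename depend on the installed axolotl version and your setup):
#   accelerate launch -m axolotl.cli.train config.yaml
# Axolotl picks up the fsdp_config above from this file.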

# === Loop di training ===
num_epochs: 1                                  # with 170GB of data, one pass is enough
# max_steps: 200000                            # alternative: set a step/token budget

# === Eval / checkpoint ===
val_set_size: 0.003
evals_per_epoch: 8
save_steps: 1000          # save every 1,000 steps (set whatever value you prefer)
save_total_limit: 5
logging_steps: 20


# === Tracciamento ===
wandb_mode: "offline"
wandb_project: zagreus-350M-sft
wandb_entity: mii-llm
wandb_name: sft

# === Token speciali ===
special_tokens:
  bos_token: <|begin_of_text|>
  pad_token: <|end_of_text|>
  eos_token: <|end_of_text|>
  #eos_token: <|eot_id|>
  unk_token: <unk>
 

ale_outputs/zagreus-350M-sft

This model is a fine-tuned version of giux78/zagreus-test-202000, trained with the SFT configuration above. It achieves the following results on the evaluation set:

  • Loss: 2.2755

Model description

More information needed

Intended uses & limitations

More information needed
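
As a minimal usage sketch (not an official example; the repo id and generation settings below are assumptions), the fine-tuned model can be loaded and prompted through its chat template:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "giux78/zagreus-test-202000-sft-6"  # assumed repo id for the published weights

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16)

messages = [
    {"role": "system", "content": "Sei un assistente utile."},
    {"role": "user", "content": "Chi sei?"},
]

# The trained chat template opens an assistant header when the last turn is not
# from the assistant, so the model continues from there.
input_ids = tokenizer.apply_chat_template(messages, return_tensors="pt")
output_ids = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True))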

Training and evaluation data

More information needed

Training procedure

Training hyperparameters

The following hyperparameters were used during training:

  • learning_rate: 1e-05
  • train_batch_size: 1
  • eval_batch_size: 1
  • seed: 1337
  • distributed_type: multi-GPU
  • num_devices: 32
  • gradient_accumulation_steps: 8
  • total_train_batch_size: 256
  • total_eval_batch_size: 32
  • optimizer: AdamW (adamw_torch_fused) with betas=(0.9, 0.95) and epsilon=1e-08; no additional optimizer arguments
  • lr_scheduler_type: cosine
  • lr_scheduler_warmup_steps: 100
  • num_epochs: 1.0
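
For reference, a sketch of how the optimizer and schedule settings above map to standard PyTorch/transformers objects (Axolotl wires this up internally; the total step count here is an illustrative assumption):

import torch
from transformers import get_cosine_schedule_with_warmup

model = torch.nn.Linear(8, 8)  # stand-in for the fine-tuned model
num_training_steps = 3465      # assumption: roughly one epoch at effective batch size 256

# adamw_torch_fused corresponds to torch.optim.AdamW (fused variant on CUDA)
optimizer = torch.optim.AdamW(
    model.parameters(), lr=1e-5, betas=(0.9, 0.95), eps=1e-8, weight_decay=0.01
)
scheduler = get_cosine_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=num_training_steps
)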

Training results

Training Loss   Epoch    Step   Validation Loss
No log          0.0003      1   2.2805
3.9252          0.1250    433   2.2768
3.8908          0.2501    866   2.2755
3.8096          0.3751   1299   2.2756
3.7972          0.5001   1732   2.2758
3.8346          0.6252   2165   2.2756
3.8084          0.7502   2598   2.2756
3.8643          0.8753   3031   2.2755

Framework versions

  • Transformers 4.56.2
  • Pytorch 2.5.1+cu121
  • Datasets 3.5.1
  • Tokenizers 0.22.1