# torchtitan Config.toml
# NOTE: this toml config is a preset for 64 A100 GPUs.

[job]
dump_folder = "./llama3_1b_output"
description = "Llama 3 1B training"
enable_wandb = true

[profiling]
enable_profiling = true
save_traces_folder = "profile_trace"
profile_freq = 100

[metrics]
log_freq = 10
enable_tensorboard = true
save_tb_folder = "tb"

[model]
name = "llama3"
flavor = "1B"
tokenizer_path = "./assets/tokenizer/Llama-3.1-8B"
# converters = ["float8"]

[optimizer]
name = "AdamW"
lr = 3e-4
eps = 1e-8

[lr_scheduler]
warmup_steps = 2000  # lr scheduler warm up

[training]
local_batch_size = 4
global_batch_size = 128
seq_len = 8192
max_norm = 1.0  # grad norm clipping
steps = 20000
compile = false
dataset = "fineweb_edu_100bt"

[parallelism]
data_parallel_replicate_degree = 1
data_parallel_shard_degree = -1
tensor_parallel_degree = 1
pipeline_parallel_degree = 1
context_parallel_degree = 1

[checkpoint]
enable_checkpoint = true
folder = "./llama3_1b_output/checkpoint"
interval = 500
last_save_model_only = true
export_dtype = "float32"
async_mode = "disabled"  # ["disabled", "async", "async_with_pinned_mem"]

[activation_checkpoint]
mode = "selective"  # ["none", "selective", "full"]
selective_ac_option = "op"  # a positive int N applies ac every N layers; "op" applies ac based on the ops policy

[float8]
enable_fsdp_float8_all_gather = false
precompute_float8_dynamic_scale_for_fsdp = false
filter_fqns = ["output"]
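
# Usage note (a minimal sketch, kept as comments so this file remains valid TOML):
# torchtitan's launcher script reads this file via the CONFIG_FILE environment variable,
# and train.py accepts --job.config_file directly. The config filename and NGPU value
# below are assumptions; adjust them to your checkout and cluster.
#   CONFIG_FILE="./llama3_1b.toml" NGPU=8 ./run_train.sh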