{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9891956782713085, "eval_steps": 50, "global_step": 624, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04801920768307323, "grad_norm": 3.922592356078373, "learning_rate": 1.4285714285714286e-06, "loss": 0.4239, "step": 10 }, { "epoch": 0.09603841536614646, "grad_norm": 1.048139141700484, "learning_rate": 3.015873015873016e-06, "loss": 0.3052, "step": 20 }, { "epoch": 0.14405762304921968, "grad_norm": 0.8538085575650297, "learning_rate": 4.603174603174604e-06, "loss": 0.2483, "step": 30 }, { "epoch": 0.19207683073229292, "grad_norm": 0.7208408522041903, "learning_rate": 6.1904761904761914e-06, "loss": 0.2312, "step": 40 }, { "epoch": 0.24009603841536614, "grad_norm": 0.6048748741273176, "learning_rate": 7.77777777777778e-06, "loss": 0.1988, "step": 50 }, { "epoch": 0.24009603841536614, "eval_loss": 0.2022361308336258, "eval_runtime": 8.7258, "eval_samples_per_second": 15.471, "eval_steps_per_second": 1.948, "step": 50 }, { "epoch": 0.28811524609843936, "grad_norm": 0.6463317852210085, "learning_rate": 9.365079365079366e-06, "loss": 0.1855, "step": 60 }, { "epoch": 0.33613445378151263, "grad_norm": 0.7181148124716302, "learning_rate": 9.99717787871887e-06, "loss": 0.1874, "step": 70 }, { "epoch": 0.38415366146458585, "grad_norm": 0.6510412334843603, "learning_rate": 9.979943117513265e-06, "loss": 0.1732, "step": 80 }, { "epoch": 0.43217286914765907, "grad_norm": 0.6418219788086171, "learning_rate": 9.947095408534483e-06, "loss": 0.1798, "step": 90 }, { "epoch": 0.4801920768307323, "grad_norm": 0.5909891764283862, "learning_rate": 9.898737734799134e-06, "loss": 0.1671, "step": 100 }, { "epoch": 0.4801920768307323, "eval_loss": 0.17217175662517548, "eval_runtime": 8.7329, "eval_samples_per_second": 15.459, "eval_steps_per_second": 1.947, "step": 100 }, { "epoch": 0.5282112845138055, "grad_norm": 0.5490095561563818, "learning_rate": 9.835021705636201e-06, "loss": 0.1672, "step": 110 }, { "epoch": 0.5762304921968787, "grad_norm": 0.5919523660281423, "learning_rate": 9.756147081366673e-06, "loss": 0.1661, "step": 120 }, { "epoch": 0.6242496998799519, "grad_norm": 0.5553308704075627, "learning_rate": 9.66236114702178e-06, "loss": 0.1678, "step": 130 }, { "epoch": 0.6722689075630253, "grad_norm": 0.5921710123949474, "learning_rate": 9.55395793706341e-06, "loss": 0.1604, "step": 140 }, { "epoch": 0.7202881152460985, "grad_norm": 0.6297463589804964, "learning_rate": 9.43127731353729e-06, "loss": 0.1581, "step": 150 }, { "epoch": 0.7202881152460985, "eval_loss": 0.15990422666072845, "eval_runtime": 8.7401, "eval_samples_per_second": 15.446, "eval_steps_per_second": 1.945, "step": 150 }, { "epoch": 0.7683073229291717, "grad_norm": 0.5203866298970263, "learning_rate": 9.294703900549096e-06, "loss": 0.1608, "step": 160 }, { "epoch": 0.8163265306122449, "grad_norm": 0.5100770394064513, "learning_rate": 9.14466587840408e-06, "loss": 0.162, "step": 170 }, { "epoch": 0.8643457382953181, "grad_norm": 0.5455220998770557, "learning_rate": 8.981633641190779e-06, "loss": 0.1566, "step": 180 }, { "epoch": 0.9123649459783914, "grad_norm": 0.48213355422073995, "learning_rate": 8.806118322017525e-06, "loss": 0.1486, "step": 190 }, { "epoch": 0.9603841536614646, "grad_norm": 0.49326253901345773, "learning_rate": 8.61867019052535e-06, "loss": 0.1513, "step": 200 }, { "epoch": 0.9603841536614646, "eval_loss": 0.1510133445262909, "eval_runtime": 8.7214, "eval_samples_per_second": 15.479, "eval_steps_per_second": 1.949, "step": 200 }, { "epoch": 1.0048019207683074, "grad_norm": 1.2575485399779591, "learning_rate": 8.41987692770139e-06, "loss": 0.1452, "step": 210 }, { "epoch": 1.0528211284513807, "grad_norm": 0.49974946529817293, "learning_rate": 8.210361783401491e-06, "loss": 0.1086, "step": 220 }, { "epoch": 1.1008403361344539, "grad_norm": 0.5909720459940095, "learning_rate": 7.990781622358535e-06, "loss": 0.1098, "step": 230 }, { "epoch": 1.148859543817527, "grad_norm": 0.5289140362803703, "learning_rate": 7.76182486480253e-06, "loss": 0.1042, "step": 240 }, { "epoch": 1.1968787515006003, "grad_norm": 0.5570818538274178, "learning_rate": 7.524209328148995e-06, "loss": 0.1104, "step": 250 }, { "epoch": 1.1968787515006003, "eval_loss": 0.15508781373500824, "eval_runtime": 8.7041, "eval_samples_per_second": 15.51, "eval_steps_per_second": 1.953, "step": 250 }, { "epoch": 1.2448979591836735, "grad_norm": 0.5285122850283916, "learning_rate": 7.278679976522279e-06, "loss": 0.1024, "step": 260 }, { "epoch": 1.2929171668667467, "grad_norm": 0.5198193011546469, "learning_rate": 7.026006585169467e-06, "loss": 0.1067, "step": 270 }, { "epoch": 1.34093637454982, "grad_norm": 0.5433402449341223, "learning_rate": 6.766981327087271e-06, "loss": 0.1106, "step": 280 }, { "epoch": 1.3889555822328932, "grad_norm": 0.591675736467511, "learning_rate": 6.502416289428282e-06, "loss": 0.1027, "step": 290 }, { "epoch": 1.4369747899159664, "grad_norm": 0.49602130865146205, "learning_rate": 6.233140927473033e-06, "loss": 0.1068, "step": 300 }, { "epoch": 1.4369747899159664, "eval_loss": 0.14931099116802216, "eval_runtime": 8.7136, "eval_samples_per_second": 15.493, "eval_steps_per_second": 1.951, "step": 300 }, { "epoch": 1.4849939975990396, "grad_norm": 0.5205229035157025, "learning_rate": 5.959999464150101e-06, "loss": 0.1043, "step": 310 }, { "epoch": 1.5330132052821128, "grad_norm": 0.5213512259437505, "learning_rate": 5.683848243257181e-06, "loss": 0.1058, "step": 320 }, { "epoch": 1.581032412965186, "grad_norm": 0.5252416105681743, "learning_rate": 5.40555304468122e-06, "loss": 0.1035, "step": 330 }, { "epoch": 1.6290516206482593, "grad_norm": 0.479035517749776, "learning_rate": 5.125986370034862e-06, "loss": 0.1032, "step": 340 }, { "epoch": 1.6770708283313325, "grad_norm": 0.47617986914412186, "learning_rate": 4.846024707219149e-06, "loss": 0.1006, "step": 350 }, { "epoch": 1.6770708283313325, "eval_loss": 0.14417614042758942, "eval_runtime": 8.7246, "eval_samples_per_second": 15.473, "eval_steps_per_second": 1.949, "step": 350 }, { "epoch": 1.725090036014406, "grad_norm": 0.5665748104077248, "learning_rate": 4.566545782488554e-06, "loss": 0.1019, "step": 360 }, { "epoch": 1.773109243697479, "grad_norm": 0.5167955766147265, "learning_rate": 4.2884258086335755e-06, "loss": 0.0976, "step": 370 }, { "epoch": 1.8211284513805523, "grad_norm": 0.5042866007823615, "learning_rate": 4.012536737908288e-06, "loss": 0.1003, "step": 380 }, { "epoch": 1.8691476590636253, "grad_norm": 0.5495519515030309, "learning_rate": 3.7397435283153795e-06, "loss": 0.0991, "step": 390 }, { "epoch": 1.9171668667466988, "grad_norm": 0.4815628493479367, "learning_rate": 3.4709014318193298e-06, "loss": 0.1029, "step": 400 }, { "epoch": 1.9171668667466988, "eval_loss": 0.14110355079174042, "eval_runtime": 8.7254, "eval_samples_per_second": 15.472, "eval_steps_per_second": 1.948, "step": 400 }, { "epoch": 1.9651860744297718, "grad_norm": 0.5378667536535642, "learning_rate": 3.2068533129896273e-06, "loss": 0.1035, "step": 410 }, { "epoch": 2.009603841536615, "grad_norm": 0.4775497724990149, "learning_rate": 2.948427006480528e-06, "loss": 0.0912, "step": 420 }, { "epoch": 2.057623049219688, "grad_norm": 0.6087554919185391, "learning_rate": 2.696432721632082e-06, "loss": 0.059, "step": 430 }, { "epoch": 2.1056422569027613, "grad_norm": 0.466286415886031, "learning_rate": 2.4516605023294626e-06, "loss": 0.0567, "step": 440 }, { "epoch": 2.1536614645858343, "grad_norm": 0.5402767853471913, "learning_rate": 2.2148777500843125e-06, "loss": 0.0617, "step": 450 }, { "epoch": 2.1536614645858343, "eval_loss": 0.1581123322248459, "eval_runtime": 8.7373, "eval_samples_per_second": 15.451, "eval_steps_per_second": 1.946, "step": 450 }, { "epoch": 2.2016806722689077, "grad_norm": 0.5420430891075357, "learning_rate": 1.9868268181037186e-06, "loss": 0.0584, "step": 460 }, { "epoch": 2.2496998799519807, "grad_norm": 0.4884067898678699, "learning_rate": 1.768222683889757e-06, "loss": 0.058, "step": 470 }, { "epoch": 2.297719087635054, "grad_norm": 0.49574699167442754, "learning_rate": 1.5597507076664187e-06, "loss": 0.0588, "step": 480 }, { "epoch": 2.345738295318127, "grad_norm": 0.484031285869081, "learning_rate": 1.362064483661617e-06, "loss": 0.0555, "step": 490 }, { "epoch": 2.3937575030012006, "grad_norm": 0.49868843579846434, "learning_rate": 1.1757837909808628e-06, "loss": 0.0584, "step": 500 }, { "epoch": 2.3937575030012006, "eval_loss": 0.1588136851787567, "eval_runtime": 8.722, "eval_samples_per_second": 15.478, "eval_steps_per_second": 1.949, "step": 500 }, { "epoch": 2.4417767106842736, "grad_norm": 0.505195377790213, "learning_rate": 1.0014926504969535e-06, "loss": 0.0568, "step": 510 }, { "epoch": 2.489795918367347, "grad_norm": 0.48055085037067635, "learning_rate": 8.397374938476594e-07, "loss": 0.057, "step": 520 }, { "epoch": 2.53781512605042, "grad_norm": 0.48774374180423785, "learning_rate": 6.910254502818914e-07, "loss": 0.0562, "step": 530 }, { "epoch": 2.5858343337334935, "grad_norm": 0.506896175562434, "learning_rate": 5.558227567253832e-07, "loss": 0.0571, "step": 540 }, { "epoch": 2.6338535414165665, "grad_norm": 0.4794928604324473, "learning_rate": 4.3455329605058436e-07, "loss": 0.0585, "step": 550 }, { "epoch": 2.6338535414165665, "eval_loss": 0.1571720838546753, "eval_runtime": 8.7061, "eval_samples_per_second": 15.506, "eval_steps_per_second": 1.953, "step": 550 }, { "epoch": 2.68187274909964, "grad_norm": 0.4447078585594017, "learning_rate": 3.275972681335421e-07, "loss": 0.0557, "step": 560 }, { "epoch": 2.729891956782713, "grad_norm": 0.5074668455489234, "learning_rate": 2.3528999786421758e-07, "loss": 0.0551, "step": 570 }, { "epoch": 2.7779111644657863, "grad_norm": 0.4931479047936826, "learning_rate": 1.5792088384733174e-07, "loss": 0.0578, "step": 580 }, { "epoch": 2.82593037214886, "grad_norm": 0.4945483234224741, "learning_rate": 9.573249108973281e-08, "loss": 0.0571, "step": 590 }, { "epoch": 2.8739495798319328, "grad_norm": 0.49071499326291995, "learning_rate": 4.891979051886153e-08, "loss": 0.0552, "step": 600 }, { "epoch": 2.8739495798319328, "eval_loss": 0.15641489624977112, "eval_runtime": 8.7188, "eval_samples_per_second": 15.484, "eval_steps_per_second": 1.95, "step": 600 }, { "epoch": 2.9219687875150058, "grad_norm": 0.4898132970657211, "learning_rate": 1.762954771655001e-08, "loss": 0.058, "step": 610 }, { "epoch": 2.969987995198079, "grad_norm": 0.44485985772895487, "learning_rate": 1.959862784577937e-09, "loss": 0.0548, "step": 620 }, { "epoch": 2.9891956782713085, "step": 624, "total_flos": 170865984536576.0, "train_loss": 0.1184023514103431, "train_runtime": 9824.5273, "train_samples_per_second": 4.068, "train_steps_per_second": 0.064 } ], "logging_steps": 10, "max_steps": 624, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 170865984536576.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }