{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9891956782713085,
  "eval_steps": 50,
  "global_step": 624,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04801920768307323,
      "grad_norm": 3.922592356078373,
      "learning_rate": 1.4285714285714286e-06,
      "loss": 0.4239,
      "step": 10
    },
    {
      "epoch": 0.09603841536614646,
      "grad_norm": 1.048139141700484,
      "learning_rate": 3.015873015873016e-06,
      "loss": 0.3052,
      "step": 20
    },
    {
      "epoch": 0.14405762304921968,
      "grad_norm": 0.8538085575650297,
      "learning_rate": 4.603174603174604e-06,
      "loss": 0.2483,
      "step": 30
    },
    {
      "epoch": 0.19207683073229292,
      "grad_norm": 0.7208408522041903,
      "learning_rate": 6.1904761904761914e-06,
      "loss": 0.2312,
      "step": 40
    },
    {
      "epoch": 0.24009603841536614,
      "grad_norm": 0.6048748741273176,
      "learning_rate": 7.77777777777778e-06,
      "loss": 0.1988,
      "step": 50
    },
    {
      "epoch": 0.24009603841536614,
      "eval_loss": 0.2022361308336258,
      "eval_runtime": 8.7258,
      "eval_samples_per_second": 15.471,
      "eval_steps_per_second": 1.948,
      "step": 50
    },
    {
      "epoch": 0.28811524609843936,
      "grad_norm": 0.6463317852210085,
      "learning_rate": 9.365079365079366e-06,
      "loss": 0.1855,
      "step": 60
    },
    {
      "epoch": 0.33613445378151263,
      "grad_norm": 0.7181148124716302,
      "learning_rate": 9.99717787871887e-06,
      "loss": 0.1874,
      "step": 70
    },
    {
      "epoch": 0.38415366146458585,
      "grad_norm": 0.6510412334843603,
      "learning_rate": 9.979943117513265e-06,
      "loss": 0.1732,
      "step": 80
    },
    {
      "epoch": 0.43217286914765907,
      "grad_norm": 0.6418219788086171,
      "learning_rate": 9.947095408534483e-06,
      "loss": 0.1798,
      "step": 90
    },
    {
      "epoch": 0.4801920768307323,
      "grad_norm": 0.5909891764283862,
      "learning_rate": 9.898737734799134e-06,
      "loss": 0.1671,
      "step": 100
    },
    {
      "epoch": 0.4801920768307323,
      "eval_loss": 0.17217175662517548,
      "eval_runtime": 8.7329,
      "eval_samples_per_second": 15.459,
      "eval_steps_per_second": 1.947,
      "step": 100
    },
    {
      "epoch": 0.5282112845138055,
      "grad_norm": 0.5490095561563818,
      "learning_rate": 9.835021705636201e-06,
      "loss": 0.1672,
      "step": 110
    },
    {
      "epoch": 0.5762304921968787,
      "grad_norm": 0.5919523660281423,
      "learning_rate": 9.756147081366673e-06,
      "loss": 0.1661,
      "step": 120
    },
    {
      "epoch": 0.6242496998799519,
      "grad_norm": 0.5553308704075627,
      "learning_rate": 9.66236114702178e-06,
      "loss": 0.1678,
      "step": 130
    },
    {
      "epoch": 0.6722689075630253,
      "grad_norm": 0.5921710123949474,
      "learning_rate": 9.55395793706341e-06,
      "loss": 0.1604,
      "step": 140
    },
    {
      "epoch": 0.7202881152460985,
      "grad_norm": 0.6297463589804964,
      "learning_rate": 9.43127731353729e-06,
      "loss": 0.1581,
      "step": 150
    },
    {
      "epoch": 0.7202881152460985,
      "eval_loss": 0.15990422666072845,
      "eval_runtime": 8.7401,
      "eval_samples_per_second": 15.446,
      "eval_steps_per_second": 1.945,
      "step": 150
    },
    {
      "epoch": 0.7683073229291717,
      "grad_norm": 0.5203866298970263,
      "learning_rate": 9.294703900549096e-06,
      "loss": 0.1608,
      "step": 160
    },
    {
      "epoch": 0.8163265306122449,
      "grad_norm": 0.5100770394064513,
      "learning_rate": 9.14466587840408e-06,
      "loss": 0.162,
      "step": 170
    },
    {
      "epoch": 0.8643457382953181,
      "grad_norm": 0.5455220998770557,
      "learning_rate": 8.981633641190779e-06,
      "loss": 0.1566,
      "step": 180
    },
    {
      "epoch": 0.9123649459783914,
      "grad_norm": 0.48213355422073995,
      "learning_rate": 8.806118322017525e-06,
      "loss": 0.1486,
      "step": 190
    },
    {
      "epoch": 0.9603841536614646,
      "grad_norm": 0.49326253901345773,
      "learning_rate": 8.61867019052535e-06,
      "loss": 0.1513,
      "step": 200
    },
    {
      "epoch": 0.9603841536614646,
      "eval_loss": 0.1510133445262909,
      "eval_runtime": 8.7214,
      "eval_samples_per_second": 15.479,
      "eval_steps_per_second": 1.949,
      "step": 200
    },
    {
      "epoch": 1.0048019207683074,
      "grad_norm": 1.2575485399779591,
      "learning_rate": 8.41987692770139e-06,
      "loss": 0.1452,
      "step": 210
    },
    {
      "epoch": 1.0528211284513807,
      "grad_norm": 0.49974946529817293,
      "learning_rate": 8.210361783401491e-06,
      "loss": 0.1086,
      "step": 220
    },
    {
      "epoch": 1.1008403361344539,
      "grad_norm": 0.5909720459940095,
      "learning_rate": 7.990781622358535e-06,
      "loss": 0.1098,
      "step": 230
    },
    {
      "epoch": 1.148859543817527,
      "grad_norm": 0.5289140362803703,
      "learning_rate": 7.76182486480253e-06,
      "loss": 0.1042,
      "step": 240
    },
    {
      "epoch": 1.1968787515006003,
      "grad_norm": 0.5570818538274178,
      "learning_rate": 7.524209328148995e-06,
      "loss": 0.1104,
      "step": 250
    },
    {
      "epoch": 1.1968787515006003,
      "eval_loss": 0.15508781373500824,
      "eval_runtime": 8.7041,
      "eval_samples_per_second": 15.51,
      "eval_steps_per_second": 1.953,
      "step": 250
    },
    {
      "epoch": 1.2448979591836735,
      "grad_norm": 0.5285122850283916,
      "learning_rate": 7.278679976522279e-06,
      "loss": 0.1024,
      "step": 260
    },
    {
      "epoch": 1.2929171668667467,
      "grad_norm": 0.5198193011546469,
      "learning_rate": 7.026006585169467e-06,
      "loss": 0.1067,
      "step": 270
    },
    {
      "epoch": 1.34093637454982,
      "grad_norm": 0.5433402449341223,
      "learning_rate": 6.766981327087271e-06,
      "loss": 0.1106,
      "step": 280
    },
    {
      "epoch": 1.3889555822328932,
      "grad_norm": 0.591675736467511,
      "learning_rate": 6.502416289428282e-06,
      "loss": 0.1027,
      "step": 290
    },
    {
      "epoch": 1.4369747899159664,
      "grad_norm": 0.49602130865146205,
      "learning_rate": 6.233140927473033e-06,
      "loss": 0.1068,
      "step": 300
    },
    {
      "epoch": 1.4369747899159664,
      "eval_loss": 0.14931099116802216,
      "eval_runtime": 8.7136,
      "eval_samples_per_second": 15.493,
      "eval_steps_per_second": 1.951,
      "step": 300
    },
    {
      "epoch": 1.4849939975990396,
      "grad_norm": 0.5205229035157025,
      "learning_rate": 5.959999464150101e-06,
      "loss": 0.1043,
      "step": 310
    },
    {
      "epoch": 1.5330132052821128,
      "grad_norm": 0.5213512259437505,
      "learning_rate": 5.683848243257181e-06,
      "loss": 0.1058,
      "step": 320
    },
    {
      "epoch": 1.581032412965186,
      "grad_norm": 0.5252416105681743,
      "learning_rate": 5.40555304468122e-06,
      "loss": 0.1035,
      "step": 330
    },
    {
      "epoch": 1.6290516206482593,
      "grad_norm": 0.479035517749776,
      "learning_rate": 5.125986370034862e-06,
      "loss": 0.1032,
      "step": 340
    },
    {
      "epoch": 1.6770708283313325,
      "grad_norm": 0.47617986914412186,
      "learning_rate": 4.846024707219149e-06,
      "loss": 0.1006,
      "step": 350
    },
    {
      "epoch": 1.6770708283313325,
      "eval_loss": 0.14417614042758942,
      "eval_runtime": 8.7246,
      "eval_samples_per_second": 15.473,
      "eval_steps_per_second": 1.949,
      "step": 350
    },
    {
      "epoch": 1.725090036014406,
      "grad_norm": 0.5665748104077248,
      "learning_rate": 4.566545782488554e-06,
      "loss": 0.1019,
      "step": 360
    },
    {
      "epoch": 1.773109243697479,
      "grad_norm": 0.5167955766147265,
      "learning_rate": 4.2884258086335755e-06,
      "loss": 0.0976,
      "step": 370
    },
    {
      "epoch": 1.8211284513805523,
      "grad_norm": 0.5042866007823615,
      "learning_rate": 4.012536737908288e-06,
      "loss": 0.1003,
      "step": 380
    },
    {
      "epoch": 1.8691476590636253,
      "grad_norm": 0.5495519515030309,
      "learning_rate": 3.7397435283153795e-06,
      "loss": 0.0991,
      "step": 390
    },
    {
      "epoch": 1.9171668667466988,
      "grad_norm": 0.4815628493479367,
      "learning_rate": 3.4709014318193298e-06,
      "loss": 0.1029,
      "step": 400
    },
    {
      "epoch": 1.9171668667466988,
      "eval_loss": 0.14110355079174042,
      "eval_runtime": 8.7254,
      "eval_samples_per_second": 15.472,
      "eval_steps_per_second": 1.948,
      "step": 400
    },
    {
      "epoch": 1.9651860744297718,
      "grad_norm": 0.5378667536535642,
      "learning_rate": 3.2068533129896273e-06,
      "loss": 0.1035,
      "step": 410
    },
    {
      "epoch": 2.009603841536615,
      "grad_norm": 0.4775497724990149,
      "learning_rate": 2.948427006480528e-06,
      "loss": 0.0912,
      "step": 420
    },
    {
      "epoch": 2.057623049219688,
      "grad_norm": 0.6087554919185391,
      "learning_rate": 2.696432721632082e-06,
      "loss": 0.059,
      "step": 430
    },
    {
      "epoch": 2.1056422569027613,
      "grad_norm": 0.466286415886031,
      "learning_rate": 2.4516605023294626e-06,
      "loss": 0.0567,
      "step": 440
    },
    {
      "epoch": 2.1536614645858343,
      "grad_norm": 0.5402767853471913,
      "learning_rate": 2.2148777500843125e-06,
      "loss": 0.0617,
      "step": 450
    },
    {
      "epoch": 2.1536614645858343,
      "eval_loss": 0.1581123322248459,
      "eval_runtime": 8.7373,
      "eval_samples_per_second": 15.451,
      "eval_steps_per_second": 1.946,
      "step": 450
    },
    {
      "epoch": 2.2016806722689077,
      "grad_norm": 0.5420430891075357,
      "learning_rate": 1.9868268181037186e-06,
      "loss": 0.0584,
      "step": 460
    },
    {
      "epoch": 2.2496998799519807,
      "grad_norm": 0.4884067898678699,
      "learning_rate": 1.768222683889757e-06,
      "loss": 0.058,
      "step": 470
    },
    {
      "epoch": 2.297719087635054,
      "grad_norm": 0.49574699167442754,
      "learning_rate": 1.5597507076664187e-06,
      "loss": 0.0588,
      "step": 480
    },
    {
      "epoch": 2.345738295318127,
      "grad_norm": 0.484031285869081,
      "learning_rate": 1.362064483661617e-06,
      "loss": 0.0555,
      "step": 490
    },
    {
      "epoch": 2.3937575030012006,
      "grad_norm": 0.49868843579846434,
      "learning_rate": 1.1757837909808628e-06,
      "loss": 0.0584,
      "step": 500
    },
    {
      "epoch": 2.3937575030012006,
      "eval_loss": 0.1588136851787567,
      "eval_runtime": 8.722,
      "eval_samples_per_second": 15.478,
      "eval_steps_per_second": 1.949,
      "step": 500
    },
    {
      "epoch": 2.4417767106842736,
      "grad_norm": 0.505195377790213,
      "learning_rate": 1.0014926504969535e-06,
      "loss": 0.0568,
      "step": 510
    },
    {
      "epoch": 2.489795918367347,
      "grad_norm": 0.48055085037067635,
      "learning_rate": 8.397374938476594e-07,
      "loss": 0.057,
      "step": 520
    },
    {
      "epoch": 2.53781512605042,
      "grad_norm": 0.48774374180423785,
      "learning_rate": 6.910254502818914e-07,
      "loss": 0.0562,
      "step": 530
    },
    {
      "epoch": 2.5858343337334935,
      "grad_norm": 0.506896175562434,
      "learning_rate": 5.558227567253832e-07,
      "loss": 0.0571,
      "step": 540
    },
    {
      "epoch": 2.6338535414165665,
      "grad_norm": 0.4794928604324473,
      "learning_rate": 4.3455329605058436e-07,
      "loss": 0.0585,
      "step": 550
    },
    {
      "epoch": 2.6338535414165665,
      "eval_loss": 0.1571720838546753,
      "eval_runtime": 8.7061,
      "eval_samples_per_second": 15.506,
      "eval_steps_per_second": 1.953,
      "step": 550
    },
    {
      "epoch": 2.68187274909964,
      "grad_norm": 0.4447078585594017,
      "learning_rate": 3.275972681335421e-07,
      "loss": 0.0557,
      "step": 560
    },
    {
      "epoch": 2.729891956782713,
      "grad_norm": 0.5074668455489234,
      "learning_rate": 2.3528999786421758e-07,
      "loss": 0.0551,
      "step": 570
    },
    {
      "epoch": 2.7779111644657863,
      "grad_norm": 0.4931479047936826,
      "learning_rate": 1.5792088384733174e-07,
      "loss": 0.0578,
      "step": 580
    },
    {
      "epoch": 2.82593037214886,
      "grad_norm": 0.4945483234224741,
      "learning_rate": 9.573249108973281e-08,
      "loss": 0.0571,
      "step": 590
    },
    {
      "epoch": 2.8739495798319328,
      "grad_norm": 0.49071499326291995,
      "learning_rate": 4.891979051886153e-08,
      "loss": 0.0552,
      "step": 600
    },
    {
      "epoch": 2.8739495798319328,
      "eval_loss": 0.15641489624977112,
      "eval_runtime": 8.7188,
      "eval_samples_per_second": 15.484,
      "eval_steps_per_second": 1.95,
      "step": 600
    },
    {
      "epoch": 2.9219687875150058,
      "grad_norm": 0.4898132970657211,
      "learning_rate": 1.762954771655001e-08,
      "loss": 0.058,
      "step": 610
    },
    {
      "epoch": 2.969987995198079,
      "grad_norm": 0.44485985772895487,
      "learning_rate": 1.959862784577937e-09,
      "loss": 0.0548,
      "step": 620
    },
    {
      "epoch": 2.9891956782713085,
      "step": 624,
      "total_flos": 170865984536576.0,
      "train_loss": 0.1184023514103431,
      "train_runtime": 9824.5273,
      "train_samples_per_second": 4.068,
      "train_steps_per_second": 0.064
    }
  ],
  "logging_steps": 10,
  "max_steps": 624,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 200,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 170865984536576.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}