{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 1404, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0035612535612535613, "grad_norm": 44.833402379319224, "learning_rate": 2.8368794326241136e-07, "loss": 1.9371, "step": 5 }, { "epoch": 0.007122507122507123, "grad_norm": 43.74279507616689, "learning_rate": 6.382978723404255e-07, "loss": 2.009, "step": 10 }, { "epoch": 0.010683760683760684, "grad_norm": 25.37626595626726, "learning_rate": 9.929078014184399e-07, "loss": 1.7062, "step": 15 }, { "epoch": 0.014245014245014245, "grad_norm": 19.266454471290054, "learning_rate": 1.347517730496454e-06, "loss": 1.5971, "step": 20 }, { "epoch": 0.017806267806267807, "grad_norm": 22.479844557734307, "learning_rate": 1.7021276595744682e-06, "loss": 1.6081, "step": 25 }, { "epoch": 0.021367521367521368, "grad_norm": 13.677032332435923, "learning_rate": 2.0567375886524823e-06, "loss": 1.2405, "step": 30 }, { "epoch": 0.02492877492877493, "grad_norm": 19.747718016010605, "learning_rate": 2.4113475177304965e-06, "loss": 1.408, "step": 35 }, { "epoch": 0.02849002849002849, "grad_norm": 16.062267378313283, "learning_rate": 2.765957446808511e-06, "loss": 1.204, "step": 40 }, { "epoch": 0.03205128205128205, "grad_norm": 12.541932714773703, "learning_rate": 3.120567375886525e-06, "loss": 0.9801, "step": 45 }, { "epoch": 0.03561253561253561, "grad_norm": 15.717555199966286, "learning_rate": 3.4751773049645393e-06, "loss": 1.1992, "step": 50 }, { "epoch": 0.03561253561253561, "eval_cooking_sharegpt_test_loss": 1.1925899982452393, "eval_cooking_sharegpt_test_runtime": 25.5385, "eval_cooking_sharegpt_test_samples_per_second": 11.551, "eval_cooking_sharegpt_test_steps_per_second": 0.587, "step": 50 }, { "epoch": 0.03917378917378917, "grad_norm": 14.545864815646969, "learning_rate": 3.8297872340425535e-06, "loss": 1.3385, "step": 55 }, { "epoch": 0.042735042735042736, "grad_norm": 12.301282225896946, "learning_rate": 4.184397163120568e-06, "loss": 1.1679, "step": 60 }, { "epoch": 0.046296296296296294, "grad_norm": 13.876973431997992, "learning_rate": 4.539007092198582e-06, "loss": 1.0384, "step": 65 }, { "epoch": 0.04985754985754986, "grad_norm": 14.102589965762364, "learning_rate": 4.893617021276596e-06, "loss": 1.162, "step": 70 }, { "epoch": 0.053418803418803416, "grad_norm": 14.177427472741687, "learning_rate": 5.24822695035461e-06, "loss": 1.2972, "step": 75 }, { "epoch": 0.05698005698005698, "grad_norm": 15.68318041066717, "learning_rate": 5.602836879432625e-06, "loss": 1.2854, "step": 80 }, { "epoch": 0.06054131054131054, "grad_norm": 18.46212217943087, "learning_rate": 5.957446808510638e-06, "loss": 1.3567, "step": 85 }, { "epoch": 0.0641025641025641, "grad_norm": 13.714247189447388, "learning_rate": 6.312056737588653e-06, "loss": 1.3013, "step": 90 }, { "epoch": 0.06766381766381767, "grad_norm": 15.493009385925916, "learning_rate": 6.666666666666667e-06, "loss": 1.1941, "step": 95 }, { "epoch": 0.07122507122507123, "grad_norm": 14.771809403290197, "learning_rate": 7.021276595744682e-06, "loss": 1.3097, "step": 100 }, { "epoch": 0.07122507122507123, "eval_cooking_sharegpt_test_loss": 1.1718764305114746, "eval_cooking_sharegpt_test_runtime": 25.1276, "eval_cooking_sharegpt_test_samples_per_second": 11.74, "eval_cooking_sharegpt_test_steps_per_second": 0.597, "step": 100 }, { "epoch": 0.07478632478632478, "grad_norm": 14.335846259905333, "learning_rate": 7.375886524822695e-06, "loss": 1.1756, "step": 105 }, { "epoch": 0.07834757834757834, "grad_norm": 14.890684707435781, "learning_rate": 7.73049645390071e-06, "loss": 1.2692, "step": 110 }, { "epoch": 0.08190883190883191, "grad_norm": 12.089051876170808, "learning_rate": 8.085106382978723e-06, "loss": 1.0333, "step": 115 }, { "epoch": 0.08547008547008547, "grad_norm": 10.590259281782513, "learning_rate": 8.439716312056738e-06, "loss": 1.254, "step": 120 }, { "epoch": 0.08903133903133903, "grad_norm": 10.472876249902619, "learning_rate": 8.794326241134753e-06, "loss": 1.1104, "step": 125 }, { "epoch": 0.09259259259259259, "grad_norm": 15.33916465346984, "learning_rate": 9.148936170212767e-06, "loss": 1.2709, "step": 130 }, { "epoch": 0.09615384615384616, "grad_norm": 13.372243798896925, "learning_rate": 9.503546099290782e-06, "loss": 1.1481, "step": 135 }, { "epoch": 0.09971509971509972, "grad_norm": 10.936516357691778, "learning_rate": 9.858156028368795e-06, "loss": 1.324, "step": 140 }, { "epoch": 0.10327635327635327, "grad_norm": 17.160593037501272, "learning_rate": 9.999860789001947e-06, "loss": 1.2962, "step": 145 }, { "epoch": 0.10683760683760683, "grad_norm": 11.569040966332384, "learning_rate": 9.999010083197449e-06, "loss": 1.1811, "step": 150 }, { "epoch": 0.10683760683760683, "eval_cooking_sharegpt_test_loss": 1.1771858930587769, "eval_cooking_sharegpt_test_runtime": 25.1202, "eval_cooking_sharegpt_test_samples_per_second": 11.744, "eval_cooking_sharegpt_test_steps_per_second": 0.597, "step": 150 }, { "epoch": 0.1103988603988604, "grad_norm": 11.3125544118252, "learning_rate": 9.997386142457449e-06, "loss": 1.1202, "step": 155 }, { "epoch": 0.11396011396011396, "grad_norm": 14.371004343512544, "learning_rate": 9.994989217969224e-06, "loss": 1.1595, "step": 160 }, { "epoch": 0.11752136752136752, "grad_norm": 16.59097224676933, "learning_rate": 9.991819680483325e-06, "loss": 1.4383, "step": 165 }, { "epoch": 0.12108262108262108, "grad_norm": 9.526076609732854, "learning_rate": 9.987878020256238e-06, "loss": 1.1714, "step": 170 }, { "epoch": 0.12464387464387465, "grad_norm": 12.651295694075731, "learning_rate": 9.983164846974549e-06, "loss": 1.1081, "step": 175 }, { "epoch": 0.1282051282051282, "grad_norm": 17.182465289526903, "learning_rate": 9.97768088966064e-06, "loss": 1.2748, "step": 180 }, { "epoch": 0.13176638176638178, "grad_norm": 13.788502714941421, "learning_rate": 9.971426996559926e-06, "loss": 1.2061, "step": 185 }, { "epoch": 0.13532763532763534, "grad_norm": 12.372477543200946, "learning_rate": 9.964404135009649e-06, "loss": 1.318, "step": 190 }, { "epoch": 0.1388888888888889, "grad_norm": 12.383450391488157, "learning_rate": 9.956613391289253e-06, "loss": 1.2142, "step": 195 }, { "epoch": 0.14245014245014245, "grad_norm": 14.955599375944699, "learning_rate": 9.948055970452362e-06, "loss": 1.2155, "step": 200 }, { "epoch": 0.14245014245014245, "eval_cooking_sharegpt_test_loss": 1.1475770473480225, "eval_cooking_sharegpt_test_runtime": 25.1728, "eval_cooking_sharegpt_test_samples_per_second": 11.719, "eval_cooking_sharegpt_test_steps_per_second": 0.596, "step": 200 }, { "epoch": 0.146011396011396, "grad_norm": 15.411148131128014, "learning_rate": 9.938733196140386e-06, "loss": 1.2208, "step": 205 }, { "epoch": 0.14957264957264957, "grad_norm": 16.797803119801483, "learning_rate": 9.928646510377782e-06, "loss": 1.2987, "step": 210 }, { "epoch": 0.15313390313390313, "grad_norm": 11.219466141377566, "learning_rate": 9.917797473349e-06, "loss": 1.4056, "step": 215 }, { "epoch": 0.15669515669515668, "grad_norm": 11.827987507537078, "learning_rate": 9.90618776315717e-06, "loss": 1.1961, "step": 220 }, { "epoch": 0.16025641025641027, "grad_norm": 10.811516191771815, "learning_rate": 9.89381917556452e-06, "loss": 1.0973, "step": 225 }, { "epoch": 0.16381766381766383, "grad_norm": 12.109963738326238, "learning_rate": 9.88069362371463e-06, "loss": 1.0753, "step": 230 }, { "epoch": 0.16737891737891739, "grad_norm": 11.506274722218329, "learning_rate": 9.8668131378365e-06, "loss": 1.2293, "step": 235 }, { "epoch": 0.17094017094017094, "grad_norm": 14.707046427485142, "learning_rate": 9.852179864930517e-06, "loss": 1.1595, "step": 240 }, { "epoch": 0.1745014245014245, "grad_norm": 9.017972482125765, "learning_rate": 9.836796068436375e-06, "loss": 1.1666, "step": 245 }, { "epoch": 0.17806267806267806, "grad_norm": 9.826974288232329, "learning_rate": 9.820664127882958e-06, "loss": 0.912, "step": 250 }, { "epoch": 0.17806267806267806, "eval_cooking_sharegpt_test_loss": 1.1096729040145874, "eval_cooking_sharegpt_test_runtime": 25.1281, "eval_cooking_sharegpt_test_samples_per_second": 11.74, "eval_cooking_sharegpt_test_steps_per_second": 0.597, "step": 250 }, { "epoch": 0.18162393162393162, "grad_norm": 12.819309374403112, "learning_rate": 9.803786538520284e-06, "loss": 1.2888, "step": 255 }, { "epoch": 0.18518518518518517, "grad_norm": 14.830959238886223, "learning_rate": 9.786165910933554e-06, "loss": 1.3695, "step": 260 }, { "epoch": 0.18874643874643873, "grad_norm": 10.125063198562845, "learning_rate": 9.767804970639338e-06, "loss": 1.2769, "step": 265 }, { "epoch": 0.19230769230769232, "grad_norm": 10.903684525438488, "learning_rate": 9.748706557664014e-06, "loss": 1.0398, "step": 270 }, { "epoch": 0.19586894586894588, "grad_norm": 9.317054654321662, "learning_rate": 9.728873626104468e-06, "loss": 1.1432, "step": 275 }, { "epoch": 0.19943019943019943, "grad_norm": 11.583163870715609, "learning_rate": 9.708309243671167e-06, "loss": 1.1464, "step": 280 }, { "epoch": 0.202991452991453, "grad_norm": 9.612813249495233, "learning_rate": 9.687016591213648e-06, "loss": 1.2692, "step": 285 }, { "epoch": 0.20655270655270655, "grad_norm": 13.226665826221033, "learning_rate": 9.664998962228523e-06, "loss": 1.2176, "step": 290 }, { "epoch": 0.2101139601139601, "grad_norm": 9.772988407782817, "learning_rate": 9.642259762350034e-06, "loss": 1.0334, "step": 295 }, { "epoch": 0.21367521367521367, "grad_norm": 15.885104782127051, "learning_rate": 9.618802508823287e-06, "loss": 1.1735, "step": 300 }, { "epoch": 0.21367521367521367, "eval_cooking_sharegpt_test_loss": 1.101279616355896, "eval_cooking_sharegpt_test_runtime": 25.1864, "eval_cooking_sharegpt_test_samples_per_second": 11.713, "eval_cooking_sharegpt_test_steps_per_second": 0.596, "step": 300 }, { "epoch": 0.21723646723646722, "grad_norm": 13.551956209995092, "learning_rate": 9.594630829960214e-06, "loss": 1.0606, "step": 305 }, { "epoch": 0.2207977207977208, "grad_norm": 11.38539964528924, "learning_rate": 9.569748464578343e-06, "loss": 1.1898, "step": 310 }, { "epoch": 0.22435897435897437, "grad_norm": 12.952331576522388, "learning_rate": 9.544159261422505e-06, "loss": 1.3299, "step": 315 }, { "epoch": 0.22792022792022792, "grad_norm": 10.79341369860483, "learning_rate": 9.5178671785695e-06, "loss": 1.1731, "step": 320 }, { "epoch": 0.23148148148148148, "grad_norm": 10.119514345693638, "learning_rate": 9.490876282815884e-06, "loss": 1.143, "step": 325 }, { "epoch": 0.23504273504273504, "grad_norm": 10.14646892591297, "learning_rate": 9.463190749048925e-06, "loss": 1.3288, "step": 330 }, { "epoch": 0.2386039886039886, "grad_norm": 12.277175037993954, "learning_rate": 9.434814859600834e-06, "loss": 1.077, "step": 335 }, { "epoch": 0.24216524216524216, "grad_norm": 15.279212663630647, "learning_rate": 9.405753003586396e-06, "loss": 1.4392, "step": 340 }, { "epoch": 0.24572649572649571, "grad_norm": 9.891111512806182, "learning_rate": 9.376009676224057e-06, "loss": 1.1344, "step": 345 }, { "epoch": 0.2492877492877493, "grad_norm": 16.31842090069061, "learning_rate": 9.34558947814063e-06, "loss": 1.2556, "step": 350 }, { "epoch": 0.2492877492877493, "eval_cooking_sharegpt_test_loss": 1.0761160850524902, "eval_cooking_sharegpt_test_runtime": 25.1498, "eval_cooking_sharegpt_test_samples_per_second": 11.73, "eval_cooking_sharegpt_test_steps_per_second": 0.596, "step": 350 }, { "epoch": 0.25284900284900286, "grad_norm": 9.67274081604058, "learning_rate": 9.31449711465967e-06, "loss": 0.9413, "step": 355 }, { "epoch": 0.2564102564102564, "grad_norm": 13.256710854656216, "learning_rate": 9.282737395073676e-06, "loss": 0.9505, "step": 360 }, { "epoch": 0.25997150997151, "grad_norm": 9.604196388467155, "learning_rate": 9.250315231900182e-06, "loss": 0.9932, "step": 365 }, { "epoch": 0.26353276353276356, "grad_norm": 9.820383288211461, "learning_rate": 9.217235640121927e-06, "loss": 0.9902, "step": 370 }, { "epoch": 0.2670940170940171, "grad_norm": 11.105325410969758, "learning_rate": 9.183503736411126e-06, "loss": 1.2068, "step": 375 }, { "epoch": 0.2706552706552707, "grad_norm": 10.192182331571592, "learning_rate": 9.149124738338053e-06, "loss": 1.145, "step": 380 }, { "epoch": 0.2742165242165242, "grad_norm": 11.093920194644765, "learning_rate": 9.114103963563986e-06, "loss": 1.1129, "step": 385 }, { "epoch": 0.2777777777777778, "grad_norm": 10.48038334814352, "learning_rate": 9.078446829018693e-06, "loss": 0.8974, "step": 390 }, { "epoch": 0.2813390313390313, "grad_norm": 11.136260072947428, "learning_rate": 9.042158850062545e-06, "loss": 1.1843, "step": 395 }, { "epoch": 0.2849002849002849, "grad_norm": 8.706608263024991, "learning_rate": 9.00524563963343e-06, "loss": 0.9215, "step": 400 }, { "epoch": 0.2849002849002849, "eval_cooking_sharegpt_test_loss": 1.0584728717803955, "eval_cooking_sharegpt_test_runtime": 25.1363, "eval_cooking_sharegpt_test_samples_per_second": 11.736, "eval_cooking_sharegpt_test_steps_per_second": 0.597, "step": 400 }, { "epoch": 0.28846153846153844, "grad_norm": 9.52814578689494, "learning_rate": 8.967712907378536e-06, "loss": 1.0151, "step": 405 }, { "epoch": 0.292022792022792, "grad_norm": 11.060156661740379, "learning_rate": 8.929566458771213e-06, "loss": 1.2623, "step": 410 }, { "epoch": 0.2955840455840456, "grad_norm": 9.438706343911894, "learning_rate": 8.890812194212987e-06, "loss": 1.0198, "step": 415 }, { "epoch": 0.29914529914529914, "grad_norm": 9.118118476052144, "learning_rate": 8.851456108120906e-06, "loss": 1.0176, "step": 420 }, { "epoch": 0.3027065527065527, "grad_norm": 9.136686057194034, "learning_rate": 8.81150428800033e-06, "loss": 1.1745, "step": 425 }, { "epoch": 0.30626780626780625, "grad_norm": 8.151106084573776, "learning_rate": 8.77096291350334e-06, "loss": 1.0955, "step": 430 }, { "epoch": 0.30982905982905984, "grad_norm": 7.860706824237664, "learning_rate": 8.729838255472875e-06, "loss": 1.3169, "step": 435 }, { "epoch": 0.31339031339031337, "grad_norm": 17.722048411152784, "learning_rate": 8.688136674972784e-06, "loss": 1.0839, "step": 440 }, { "epoch": 0.31695156695156695, "grad_norm": 10.926099468188049, "learning_rate": 8.6458646223039e-06, "loss": 1.071, "step": 445 }, { "epoch": 0.32051282051282054, "grad_norm": 9.719204604808613, "learning_rate": 8.603028636006324e-06, "loss": 1.1962, "step": 450 }, { "epoch": 0.32051282051282054, "eval_cooking_sharegpt_test_loss": 1.0415297746658325, "eval_cooking_sharegpt_test_runtime": 25.1437, "eval_cooking_sharegpt_test_samples_per_second": 11.733, "eval_cooking_sharegpt_test_steps_per_second": 0.597, "step": 450 }, { "epoch": 0.32407407407407407, "grad_norm": 11.243642702609568, "learning_rate": 8.559635341848072e-06, "loss": 1.1421, "step": 455 }, { "epoch": 0.32763532763532766, "grad_norm": 7.923234941167258, "learning_rate": 8.515691451800206e-06, "loss": 1.042, "step": 460 }, { "epoch": 0.3311965811965812, "grad_norm": 7.907018995179874, "learning_rate": 8.471203762998638e-06, "loss": 1.018, "step": 465 }, { "epoch": 0.33475783475783477, "grad_norm": 8.402048802301243, "learning_rate": 8.426179156692784e-06, "loss": 0.9912, "step": 470 }, { "epoch": 0.3383190883190883, "grad_norm": 9.562531305339338, "learning_rate": 8.380624597181165e-06, "loss": 0.858, "step": 475 }, { "epoch": 0.3418803418803419, "grad_norm": 10.445438815619365, "learning_rate": 8.334547130734202e-06, "loss": 1.0234, "step": 480 }, { "epoch": 0.3454415954415954, "grad_norm": 6.521933461862074, "learning_rate": 8.287953884504317e-06, "loss": 1.059, "step": 485 }, { "epoch": 0.349002849002849, "grad_norm": 10.464115086374438, "learning_rate": 8.240852065423507e-06, "loss": 1.1277, "step": 490 }, { "epoch": 0.3525641025641026, "grad_norm": 12.389409593234957, "learning_rate": 8.193248959088604e-06, "loss": 1.1394, "step": 495 }, { "epoch": 0.3561253561253561, "grad_norm": 12.139748159489233, "learning_rate": 8.145151928634362e-06, "loss": 1.1302, "step": 500 }, { "epoch": 0.3561253561253561, "eval_cooking_sharegpt_test_loss": 1.0269293785095215, "eval_cooking_sharegpt_test_runtime": 25.1353, "eval_cooking_sharegpt_test_samples_per_second": 11.737, "eval_cooking_sharegpt_test_steps_per_second": 0.597, "step": 500 }, { "epoch": 0.3596866096866097, "grad_norm": 8.91427993475576, "learning_rate": 8.096568413594533e-06, "loss": 1.197, "step": 505 }, { "epoch": 0.36324786324786323, "grad_norm": 11.99885664273889, "learning_rate": 8.047505928751141e-06, "loss": 1.1582, "step": 510 }, { "epoch": 0.3668091168091168, "grad_norm": 9.282663273615302, "learning_rate": 7.997972062972118e-06, "loss": 0.9002, "step": 515 }, { "epoch": 0.37037037037037035, "grad_norm": 9.201970639869488, "learning_rate": 7.947974478037468e-06, "loss": 1.0111, "step": 520 }, { "epoch": 0.37393162393162394, "grad_norm": 7.021792019342668, "learning_rate": 7.89752090745417e-06, "loss": 0.9395, "step": 525 }, { "epoch": 0.37749287749287747, "grad_norm": 12.461649264071799, "learning_rate": 7.846619155259976e-06, "loss": 1.1162, "step": 530 }, { "epoch": 0.38105413105413105, "grad_norm": 8.731763461970615, "learning_rate": 7.795277094816292e-06, "loss": 1.1337, "step": 535 }, { "epoch": 0.38461538461538464, "grad_norm": 10.365939099504095, "learning_rate": 7.743502667590356e-06, "loss": 0.9248, "step": 540 }, { "epoch": 0.38817663817663817, "grad_norm": 8.821843151827109, "learning_rate": 7.691303881926868e-06, "loss": 0.9228, "step": 545 }, { "epoch": 0.39173789173789175, "grad_norm": 7.987433750859592, "learning_rate": 7.638688811809274e-06, "loss": 0.9909, "step": 550 }, { "epoch": 0.39173789173789175, "eval_cooking_sharegpt_test_loss": 0.9956559538841248, "eval_cooking_sharegpt_test_runtime": 25.169, "eval_cooking_sharegpt_test_samples_per_second": 11.721, "eval_cooking_sharegpt_test_steps_per_second": 0.596, "step": 550 }, { "epoch": 0.3952991452991453, "grad_norm": 8.228166781582255, "learning_rate": 7.585665595610907e-06, "loss": 1.0333, "step": 555 }, { "epoch": 0.39886039886039887, "grad_norm": 9.275535768379024, "learning_rate": 7.532242434836159e-06, "loss": 0.9723, "step": 560 }, { "epoch": 0.4024216524216524, "grad_norm": 8.695313399934099, "learning_rate": 7.478427592851894e-06, "loss": 0.8215, "step": 565 }, { "epoch": 0.405982905982906, "grad_norm": 10.855874044422267, "learning_rate": 7.424229393609291e-06, "loss": 1.204, "step": 570 }, { "epoch": 0.40954415954415957, "grad_norm": 8.204555023023863, "learning_rate": 7.369656220356314e-06, "loss": 0.9245, "step": 575 }, { "epoch": 0.4131054131054131, "grad_norm": 11.251608020595736, "learning_rate": 7.314716514341007e-06, "loss": 1.0165, "step": 580 }, { "epoch": 0.4166666666666667, "grad_norm": 5.688063938388241, "learning_rate": 7.259418773505828e-06, "loss": 1.1104, "step": 585 }, { "epoch": 0.4202279202279202, "grad_norm": 8.239363254620745, "learning_rate": 7.203771551173212e-06, "loss": 0.9522, "step": 590 }, { "epoch": 0.4237891737891738, "grad_norm": 10.844788756143517, "learning_rate": 7.147783454722545e-06, "loss": 1.1714, "step": 595 }, { "epoch": 0.42735042735042733, "grad_norm": 9.68832698247982, "learning_rate": 7.091463144258814e-06, "loss": 0.9888, "step": 600 }, { "epoch": 0.42735042735042733, "eval_cooking_sharegpt_test_loss": 0.9794056415557861, "eval_cooking_sharegpt_test_runtime": 25.1533, "eval_cooking_sharegpt_test_samples_per_second": 11.728, "eval_cooking_sharegpt_test_steps_per_second": 0.596, "step": 600 }, { "epoch": 0.4309116809116809, "grad_norm": 9.606150117246845, "learning_rate": 7.034819331273079e-06, "loss": 1.046, "step": 605 }, { "epoch": 0.43447293447293445, "grad_norm": 13.715546593351425, "learning_rate": 6.9778607772949894e-06, "loss": 1.2122, "step": 610 }, { "epoch": 0.43803418803418803, "grad_norm": 7.3944523471734245, "learning_rate": 6.920596292537582e-06, "loss": 0.8385, "step": 615 }, { "epoch": 0.4415954415954416, "grad_norm": 6.9604425292068415, "learning_rate": 6.8630347345345415e-06, "loss": 0.9568, "step": 620 }, { "epoch": 0.44515669515669515, "grad_norm": 14.912593395050978, "learning_rate": 6.805185006770125e-06, "loss": 1.1887, "step": 625 }, { "epoch": 0.44871794871794873, "grad_norm": 10.386732676943884, "learning_rate": 6.747056057302001e-06, "loss": 0.9092, "step": 630 }, { "epoch": 0.45227920227920226, "grad_norm": 10.175363036394838, "learning_rate": 6.6886568773771865e-06, "loss": 0.875, "step": 635 }, { "epoch": 0.45584045584045585, "grad_norm": 11.544505353798941, "learning_rate": 6.629996500041299e-06, "loss": 1.1254, "step": 640 }, { "epoch": 0.4594017094017094, "grad_norm": 10.65880139786318, "learning_rate": 6.571083998741346e-06, "loss": 0.9673, "step": 645 }, { "epoch": 0.46296296296296297, "grad_norm": 9.13546570087864, "learning_rate": 6.511928485922272e-06, "loss": 0.9264, "step": 650 }, { "epoch": 0.46296296296296297, "eval_cooking_sharegpt_test_loss": 0.970525324344635, "eval_cooking_sharegpt_test_runtime": 25.0887, "eval_cooking_sharegpt_test_samples_per_second": 11.758, "eval_cooking_sharegpt_test_steps_per_second": 0.598, "step": 650 }, { "epoch": 0.46652421652421655, "grad_norm": 8.684744592391311, "learning_rate": 6.452539111617454e-06, "loss": 0.9619, "step": 655 }, { "epoch": 0.4700854700854701, "grad_norm": 9.775487262609003, "learning_rate": 6.3929250620334145e-06, "loss": 1.1151, "step": 660 }, { "epoch": 0.47364672364672367, "grad_norm": 8.338750078415066, "learning_rate": 6.333095558128905e-06, "loss": 1.0223, "step": 665 }, { "epoch": 0.4772079772079772, "grad_norm": 11.378546918298742, "learning_rate": 6.273059854188636e-06, "loss": 1.1187, "step": 670 }, { "epoch": 0.4807692307692308, "grad_norm": 9.327677771719234, "learning_rate": 6.212827236391856e-06, "loss": 1.1275, "step": 675 }, { "epoch": 0.4843304843304843, "grad_norm": 12.797120466370396, "learning_rate": 6.152407021375964e-06, "loss": 1.1803, "step": 680 }, { "epoch": 0.4878917378917379, "grad_norm": 7.871686119587897, "learning_rate": 6.091808554795462e-06, "loss": 1.0758, "step": 685 }, { "epoch": 0.49145299145299143, "grad_norm": 8.782285684574859, "learning_rate": 6.0310412098763685e-06, "loss": 0.9498, "step": 690 }, { "epoch": 0.495014245014245, "grad_norm": 7.976535422520014, "learning_rate": 5.970114385966404e-06, "loss": 1.0068, "step": 695 }, { "epoch": 0.4985754985754986, "grad_norm": 7.25424964835724, "learning_rate": 5.9090375070811215e-06, "loss": 0.9638, "step": 700 }, { "epoch": 0.4985754985754986, "eval_cooking_sharegpt_test_loss": 0.9512233734130859, "eval_cooking_sharegpt_test_runtime": 25.1298, "eval_cooking_sharegpt_test_samples_per_second": 11.739, "eval_cooking_sharegpt_test_steps_per_second": 0.597, "step": 700 }, { "epoch": 0.5021367521367521, "grad_norm": 8.232771918485797, "learning_rate": 5.847820020446212e-06, "loss": 0.952, "step": 705 }, { "epoch": 0.5056980056980057, "grad_norm": 12.19309738852622, "learning_rate": 5.786471395036243e-06, "loss": 0.9951, "step": 710 }, { "epoch": 0.5092592592592593, "grad_norm": 8.465063570959536, "learning_rate": 5.72500112011001e-06, "loss": 0.7584, "step": 715 }, { "epoch": 0.5128205128205128, "grad_norm": 9.189426713538198, "learning_rate": 5.663418703742769e-06, "loss": 1.0317, "step": 720 }, { "epoch": 0.5163817663817664, "grad_norm": 8.689948324565426, "learning_rate": 5.601733671355544e-06, "loss": 1.0824, "step": 725 }, { "epoch": 0.51994301994302, "grad_norm": 9.082539822936914, "learning_rate": 5.53995556424176e-06, "loss": 1.0486, "step": 730 }, { "epoch": 0.5235042735042735, "grad_norm": 8.19463689001411, "learning_rate": 5.4780939380914185e-06, "loss": 0.8952, "step": 735 }, { "epoch": 0.5270655270655271, "grad_norm": 7.059767033906409, "learning_rate": 5.416158361513046e-06, "loss": 0.9262, "step": 740 }, { "epoch": 0.5306267806267806, "grad_norm": 10.913682478063778, "learning_rate": 5.3541584145536475e-06, "loss": 1.0653, "step": 745 }, { "epoch": 0.5341880341880342, "grad_norm": 8.733965389198554, "learning_rate": 5.292103687216875e-06, "loss": 1.0246, "step": 750 }, { "epoch": 0.5341880341880342, "eval_cooking_sharegpt_test_loss": 0.9445996284484863, "eval_cooking_sharegpt_test_runtime": 25.1377, "eval_cooking_sharegpt_test_samples_per_second": 11.735, "eval_cooking_sharegpt_test_steps_per_second": 0.597, "step": 750 }, { "epoch": 0.5377492877492878, "grad_norm": 11.499827601620082, "learning_rate": 5.230003777979692e-06, "loss": 1.2262, "step": 755 }, { "epoch": 0.5413105413105413, "grad_norm": 10.606186966272212, "learning_rate": 5.167868292307679e-06, "loss": 1.0403, "step": 760 }, { "epoch": 0.5448717948717948, "grad_norm": 7.96919172585767, "learning_rate": 5.105706841169301e-06, "loss": 0.9911, "step": 765 }, { "epoch": 0.5484330484330484, "grad_norm": 9.43310127134505, "learning_rate": 5.0435290395492975e-06, "loss": 0.9054, "step": 770 }, { "epoch": 0.551994301994302, "grad_norm": 9.314224624769569, "learning_rate": 4.981344504961459e-06, "loss": 1.1465, "step": 775 }, { "epoch": 0.5555555555555556, "grad_norm": 11.93233414191257, "learning_rate": 4.919162855961022e-06, "loss": 1.0707, "step": 780 }, { "epoch": 0.5591168091168092, "grad_norm": 9.938006328205704, "learning_rate": 4.85699371065688e-06, "loss": 1.0182, "step": 785 }, { "epoch": 0.5626780626780626, "grad_norm": 6.577660815946318, "learning_rate": 4.7948466852238844e-06, "loss": 0.828, "step": 790 }, { "epoch": 0.5662393162393162, "grad_norm": 9.279085577417154, "learning_rate": 4.732731392415448e-06, "loss": 0.9767, "step": 795 }, { "epoch": 0.5698005698005698, "grad_norm": 8.335503043230593, "learning_rate": 4.670657440076645e-06, "loss": 1.0273, "step": 800 }, { "epoch": 0.5698005698005698, "eval_cooking_sharegpt_test_loss": 0.9218639135360718, "eval_cooking_sharegpt_test_runtime": 25.1715, "eval_cooking_sharegpt_test_samples_per_second": 11.72, "eval_cooking_sharegpt_test_steps_per_second": 0.596, "step": 800 }, { "epoch": 0.5733618233618234, "grad_norm": 7.569429593974213, "learning_rate": 4.6086344296581095e-06, "loss": 1.0178, "step": 805 }, { "epoch": 0.5769230769230769, "grad_norm": 6.729714203805775, "learning_rate": 4.546671954730904e-06, "loss": 0.9069, "step": 810 }, { "epoch": 0.5804843304843305, "grad_norm": 9.67272058913767, "learning_rate": 4.484779599502598e-06, "loss": 1.0266, "step": 815 }, { "epoch": 0.584045584045584, "grad_norm": 7.918807492684255, "learning_rate": 4.4229669373348225e-06, "loss": 1.0549, "step": 820 }, { "epoch": 0.5876068376068376, "grad_norm": 8.112311559812307, "learning_rate": 4.361243529262472e-06, "loss": 0.9519, "step": 825 }, { "epoch": 0.5911680911680912, "grad_norm": 7.594376338561705, "learning_rate": 4.2996189225148284e-06, "loss": 0.8388, "step": 830 }, { "epoch": 0.5947293447293447, "grad_norm": 10.189580347416447, "learning_rate": 4.238102649038825e-06, "loss": 0.8922, "step": 835 }, { "epoch": 0.5982905982905983, "grad_norm": 10.319766432633255, "learning_rate": 4.176704224024663e-06, "loss": 0.9128, "step": 840 }, { "epoch": 0.6018518518518519, "grad_norm": 7.400903938112323, "learning_rate": 4.115433144434023e-06, "loss": 0.7854, "step": 845 }, { "epoch": 0.6054131054131054, "grad_norm": 11.602955405807014, "learning_rate": 4.054298887531099e-06, "loss": 1.0135, "step": 850 }, { "epoch": 0.6054131054131054, "eval_cooking_sharegpt_test_loss": 0.9164636135101318, "eval_cooking_sharegpt_test_runtime": 25.1356, "eval_cooking_sharegpt_test_samples_per_second": 11.736, "eval_cooking_sharegpt_test_steps_per_second": 0.597, "step": 850 }, { "epoch": 0.6089743589743589, "grad_norm": 6.833472023976989, "learning_rate": 3.993310909416682e-06, "loss": 0.821, "step": 855 }, { "epoch": 0.6125356125356125, "grad_norm": 11.936740806575038, "learning_rate": 3.932478643565506e-06, "loss": 0.9109, "step": 860 }, { "epoch": 0.6160968660968661, "grad_norm": 10.359678126184791, "learning_rate": 3.8718114993671086e-06, "loss": 0.8265, "step": 865 }, { "epoch": 0.6196581196581197, "grad_norm": 9.6934350511858, "learning_rate": 3.8113188606704004e-06, "loss": 1.0465, "step": 870 }, { "epoch": 0.6232193732193733, "grad_norm": 10.724551346505196, "learning_rate": 3.7510100843322e-06, "loss": 0.8743, "step": 875 }, { "epoch": 0.6267806267806267, "grad_norm": 9.65574624824107, "learning_rate": 3.6908944987699346e-06, "loss": 0.8701, "step": 880 }, { "epoch": 0.6303418803418803, "grad_norm": 10.398287858243952, "learning_rate": 3.630981402518743e-06, "loss": 1.0248, "step": 885 }, { "epoch": 0.6339031339031339, "grad_norm": 9.29596650064249, "learning_rate": 3.5712800627932064e-06, "loss": 0.7934, "step": 890 }, { "epoch": 0.6374643874643875, "grad_norm": 9.777972147521435, "learning_rate": 3.5117997140539073e-06, "loss": 1.0501, "step": 895 }, { "epoch": 0.6410256410256411, "grad_norm": 8.613583947749785, "learning_rate": 3.452549556579069e-06, "loss": 1.0145, "step": 900 }, { "epoch": 0.6410256410256411, "eval_cooking_sharegpt_test_loss": 0.8929095268249512, "eval_cooking_sharegpt_test_runtime": 25.1632, "eval_cooking_sharegpt_test_samples_per_second": 11.723, "eval_cooking_sharegpt_test_steps_per_second": 0.596, "step": 900 }, { "epoch": 0.6445868945868946, "grad_norm": 9.470472358054097, "learning_rate": 3.393538755041485e-06, "loss": 1.0691, "step": 905 }, { "epoch": 0.6481481481481481, "grad_norm": 6.4913464894411375, "learning_rate": 3.334776437090944e-06, "loss": 1.0877, "step": 910 }, { "epoch": 0.6517094017094017, "grad_norm": 7.520341201913034, "learning_rate": 3.276271691942383e-06, "loss": 0.9449, "step": 915 }, { "epoch": 0.6552706552706553, "grad_norm": 8.054250107454035, "learning_rate": 3.218033568969997e-06, "loss": 0.8356, "step": 920 }, { "epoch": 0.6588319088319088, "grad_norm": 8.372218814210639, "learning_rate": 3.1600710763074972e-06, "loss": 1.0003, "step": 925 }, { "epoch": 0.6623931623931624, "grad_norm": 7.833703780065361, "learning_rate": 3.102393179454758e-06, "loss": 0.8384, "step": 930 }, { "epoch": 0.665954415954416, "grad_norm": 7.080976148675942, "learning_rate": 3.0450087998910582e-06, "loss": 0.7447, "step": 935 }, { "epoch": 0.6695156695156695, "grad_norm": 7.7526680766277565, "learning_rate": 2.9879268136951163e-06, "loss": 0.8658, "step": 940 }, { "epoch": 0.6730769230769231, "grad_norm": 7.180472531972662, "learning_rate": 2.9311560501721726e-06, "loss": 0.8843, "step": 945 }, { "epoch": 0.6766381766381766, "grad_norm": 10.35202839609084, "learning_rate": 2.8747052904882845e-06, "loss": 0.8796, "step": 950 }, { "epoch": 0.6766381766381766, "eval_cooking_sharegpt_test_loss": 0.881673276424408, "eval_cooking_sharegpt_test_runtime": 25.1639, "eval_cooking_sharegpt_test_samples_per_second": 11.723, "eval_cooking_sharegpt_test_steps_per_second": 0.596, "step": 950 }, { "epoch": 0.6801994301994302, "grad_norm": 8.362057220633533, "learning_rate": 2.8185832663120817e-06, "loss": 0.9347, "step": 955 }, { "epoch": 0.6837606837606838, "grad_norm": 8.241841325477711, "learning_rate": 2.762798658464172e-06, "loss": 0.8928, "step": 960 }, { "epoch": 0.6873219373219374, "grad_norm": 6.551302209830729, "learning_rate": 2.707360095574411e-06, "loss": 0.7839, "step": 965 }, { "epoch": 0.6908831908831908, "grad_norm": 6.6076016485616655, "learning_rate": 2.6522761527472464e-06, "loss": 0.7627, "step": 970 }, { "epoch": 0.6944444444444444, "grad_norm": 8.77988837920022, "learning_rate": 2.5975553502353413e-06, "loss": 0.8209, "step": 975 }, { "epoch": 0.698005698005698, "grad_norm": 10.773785033508863, "learning_rate": 2.543206152121685e-06, "loss": 1.012, "step": 980 }, { "epoch": 0.7015669515669516, "grad_norm": 11.005117893637754, "learning_rate": 2.4892369650103837e-06, "loss": 1.1062, "step": 985 }, { "epoch": 0.7051282051282052, "grad_norm": 10.695115403778294, "learning_rate": 2.435656136726353e-06, "loss": 1.1155, "step": 990 }, { "epoch": 0.7086894586894587, "grad_norm": 7.487787361260749, "learning_rate": 2.3824719550240876e-06, "loss": 0.8406, "step": 995 }, { "epoch": 0.7122507122507122, "grad_norm": 10.856423210102815, "learning_rate": 2.3296926463057396e-06, "loss": 1.0704, "step": 1000 }, { "epoch": 0.7122507122507122, "eval_cooking_sharegpt_test_loss": 0.8724547028541565, "eval_cooking_sharegpt_test_runtime": 25.1335, "eval_cooking_sharegpt_test_samples_per_second": 11.737, "eval_cooking_sharegpt_test_steps_per_second": 0.597, "step": 1000 }, { "epoch": 0.7158119658119658, "grad_norm": 7.72614857348764, "learning_rate": 2.2773263743486806e-06, "loss": 0.7973, "step": 1005 }, { "epoch": 0.7193732193732194, "grad_norm": 7.759931181855395, "learning_rate": 2.2253812390427325e-06, "loss": 0.9151, "step": 1010 }, { "epoch": 0.7229344729344729, "grad_norm": 11.783476332912898, "learning_rate": 2.173865275137314e-06, "loss": 1.0303, "step": 1015 }, { "epoch": 0.7264957264957265, "grad_norm": 12.061153004333192, "learning_rate": 2.1227864509986358e-06, "loss": 1.0098, "step": 1020 }, { "epoch": 0.73005698005698, "grad_norm": 9.467966100641553, "learning_rate": 2.0721526673771674e-06, "loss": 1.0981, "step": 1025 }, { "epoch": 0.7336182336182336, "grad_norm": 12.78076769543128, "learning_rate": 2.0219717561855857e-06, "loss": 0.9828, "step": 1030 }, { "epoch": 0.7371794871794872, "grad_norm": 9.392109838630013, "learning_rate": 1.9722514792873348e-06, "loss": 0.7925, "step": 1035 }, { "epoch": 0.7407407407407407, "grad_norm": 12.742615228478014, "learning_rate": 1.922999527296046e-06, "loss": 0.8235, "step": 1040 }, { "epoch": 0.7443019943019943, "grad_norm": 6.948962068993543, "learning_rate": 1.8742235183859747e-06, "loss": 0.8002, "step": 1045 }, { "epoch": 0.7478632478632479, "grad_norm": 7.707762985139897, "learning_rate": 1.8259309971136441e-06, "loss": 0.9156, "step": 1050 }, { "epoch": 0.7478632478632479, "eval_cooking_sharegpt_test_loss": 0.8574908375740051, "eval_cooking_sharegpt_test_runtime": 25.1297, "eval_cooking_sharegpt_test_samples_per_second": 11.739, "eval_cooking_sharegpt_test_steps_per_second": 0.597, "step": 1050 }, { "epoch": 0.7514245014245015, "grad_norm": 13.101359051256853, "learning_rate": 1.7781294332508532e-06, "loss": 0.8874, "step": 1055 }, { "epoch": 0.7549857549857549, "grad_norm": 11.726365244747157, "learning_rate": 1.7308262206292898e-06, "loss": 1.1045, "step": 1060 }, { "epoch": 0.7585470085470085, "grad_norm": 13.003352236596228, "learning_rate": 1.6840286759968593e-06, "loss": 0.9162, "step": 1065 }, { "epoch": 0.7621082621082621, "grad_norm": 7.45149127073539, "learning_rate": 1.6377440378859532e-06, "loss": 0.862, "step": 1070 }, { "epoch": 0.7656695156695157, "grad_norm": 9.426189330985709, "learning_rate": 1.591979465493806e-06, "loss": 0.9508, "step": 1075 }, { "epoch": 0.7692307692307693, "grad_norm": 7.157063282706341, "learning_rate": 1.5467420375751285e-06, "loss": 0.706, "step": 1080 }, { "epoch": 0.7727920227920227, "grad_norm": 9.717033901286259, "learning_rate": 1.5020387513471878e-06, "loss": 0.9039, "step": 1085 }, { "epoch": 0.7763532763532763, "grad_norm": 9.632692635104767, "learning_rate": 1.4578765214074842e-06, "loss": 0.846, "step": 1090 }, { "epoch": 0.7799145299145299, "grad_norm": 10.252871708668916, "learning_rate": 1.414262178664228e-06, "loss": 0.9271, "step": 1095 }, { "epoch": 0.7834757834757835, "grad_norm": 8.889977409022517, "learning_rate": 1.371202469279751e-06, "loss": 1.0533, "step": 1100 }, { "epoch": 0.7834757834757835, "eval_cooking_sharegpt_test_loss": 0.851457953453064, "eval_cooking_sharegpt_test_runtime": 25.1652, "eval_cooking_sharegpt_test_samples_per_second": 11.723, "eval_cooking_sharegpt_test_steps_per_second": 0.596, "step": 1100 }, { "epoch": 0.7870370370370371, "grad_norm": 8.468302321532613, "learning_rate": 1.3287040536270135e-06, "loss": 0.9639, "step": 1105 }, { "epoch": 0.7905982905982906, "grad_norm": 6.629557905957202, "learning_rate": 1.2867735052594083e-06, "loss": 0.8639, "step": 1110 }, { "epoch": 0.7941595441595442, "grad_norm": 7.980307587558109, "learning_rate": 1.2454173098939715e-06, "loss": 0.8244, "step": 1115 }, { "epoch": 0.7977207977207977, "grad_norm": 11.90234537404138, "learning_rate": 1.2046418644081904e-06, "loss": 1.0325, "step": 1120 }, { "epoch": 0.8012820512820513, "grad_norm": 9.539029676112726, "learning_rate": 1.1644534758505476e-06, "loss": 0.9097, "step": 1125 }, { "epoch": 0.8048433048433048, "grad_norm": 6.748924825533006, "learning_rate": 1.1248583604649639e-06, "loss": 0.7796, "step": 1130 }, { "epoch": 0.8084045584045584, "grad_norm": 10.559494436276363, "learning_rate": 1.0858626427292796e-06, "loss": 1.1171, "step": 1135 }, { "epoch": 0.811965811965812, "grad_norm": 7.308475257839191, "learning_rate": 1.0474723544079406e-06, "loss": 0.8793, "step": 1140 }, { "epoch": 0.8155270655270656, "grad_norm": 6.611820127778117, "learning_rate": 1.009693433619024e-06, "loss": 0.8264, "step": 1145 }, { "epoch": 0.8190883190883191, "grad_norm": 9.464697425274275, "learning_rate": 9.72531723915726e-07, "loss": 0.9419, "step": 1150 }, { "epoch": 0.8190883190883191, "eval_cooking_sharegpt_test_loss": 0.8405951857566833, "eval_cooking_sharegpt_test_runtime": 25.1452, "eval_cooking_sharegpt_test_samples_per_second": 11.732, "eval_cooking_sharegpt_test_steps_per_second": 0.597, "step": 1150 }, { "epoch": 0.8226495726495726, "grad_norm": 6.834807537166378, "learning_rate": 9.359929733825157e-07, "loss": 0.8623, "step": 1155 }, { "epoch": 0.8262108262108262, "grad_norm": 7.882351226038981, "learning_rate": 9.000828337460226e-07, "loss": 0.8607, "step": 1160 }, { "epoch": 0.8297720797720798, "grad_norm": 9.808423484159281, "learning_rate": 8.648068595008458e-07, "loss": 0.9397, "step": 1165 }, { "epoch": 0.8333333333333334, "grad_norm": 7.461469730293582, "learning_rate": 8.301705070503957e-07, "loss": 0.8206, "step": 1170 }, { "epoch": 0.8368945868945868, "grad_norm": 12.981921418633185, "learning_rate": 7.961791338629127e-07, "loss": 0.9414, "step": 1175 }, { "epoch": 0.8404558404558404, "grad_norm": 9.121636387718846, "learning_rate": 7.628379976427868e-07, "loss": 0.9753, "step": 1180 }, { "epoch": 0.844017094017094, "grad_norm": 7.120190586098952, "learning_rate": 7.30152255517303e-07, "loss": 0.7873, "step": 1185 }, { "epoch": 0.8475783475783476, "grad_norm": 10.01810752311074, "learning_rate": 6.981269632389603e-07, "loss": 1.0376, "step": 1190 }, { "epoch": 0.8511396011396012, "grad_norm": 6.925155458181609, "learning_rate": 6.667670744034498e-07, "loss": 0.777, "step": 1195 }, { "epoch": 0.8547008547008547, "grad_norm": 9.565521598630072, "learning_rate": 6.360774396834468e-07, "loss": 0.7706, "step": 1200 }, { "epoch": 0.8547008547008547, "eval_cooking_sharegpt_test_loss": 0.8347706198692322, "eval_cooking_sharegpt_test_runtime": 25.169, "eval_cooking_sharegpt_test_samples_per_second": 11.721, "eval_cooking_sharegpt_test_steps_per_second": 0.596, "step": 1200 }, { "epoch": 0.8582621082621082, "grad_norm": 8.922222684845552, "learning_rate": 6.060628060783235e-07, "loss": 0.9442, "step": 1205 }, { "epoch": 0.8618233618233618, "grad_norm": 8.307345994609264, "learning_rate": 5.767278161798912e-07, "loss": 0.8203, "step": 1210 }, { "epoch": 0.8653846153846154, "grad_norm": 8.637121416954832, "learning_rate": 5.480770074542979e-07, "loss": 0.9698, "step": 1215 }, { "epoch": 0.8689458689458689, "grad_norm": 9.871553780456603, "learning_rate": 5.20114811540181e-07, "loss": 0.8873, "step": 1220 }, { "epoch": 0.8725071225071225, "grad_norm": 9.8312201601066, "learning_rate": 4.92845553563196e-07, "loss": 0.7253, "step": 1225 }, { "epoch": 0.8760683760683761, "grad_norm": 8.250412383894227, "learning_rate": 4.6627345146700974e-07, "loss": 1.0156, "step": 1230 }, { "epoch": 0.8796296296296297, "grad_norm": 8.580990920000557, "learning_rate": 4.4040261536088533e-07, "loss": 0.8478, "step": 1235 }, { "epoch": 0.8831908831908832, "grad_norm": 8.094207522496205, "learning_rate": 4.1523704688394176e-07, "loss": 0.7699, "step": 1240 }, { "epoch": 0.8867521367521367, "grad_norm": 6.781712567016349, "learning_rate": 3.9078063858617956e-07, "loss": 0.7973, "step": 1245 }, { "epoch": 0.8903133903133903, "grad_norm": 9.196236625649593, "learning_rate": 3.670371733264011e-07, "loss": 0.7966, "step": 1250 }, { "epoch": 0.8903133903133903, "eval_cooking_sharegpt_test_loss": 0.8313595056533813, "eval_cooking_sharegpt_test_runtime": 25.1348, "eval_cooking_sharegpt_test_samples_per_second": 11.737, "eval_cooking_sharegpt_test_steps_per_second": 0.597, "step": 1250 }, { "epoch": 0.8938746438746439, "grad_norm": 6.498737764870842, "learning_rate": 3.440103236870823e-07, "loss": 0.854, "step": 1255 }, { "epoch": 0.8974358974358975, "grad_norm": 7.159593152971597, "learning_rate": 3.217036514063082e-07, "loss": 0.8703, "step": 1260 }, { "epoch": 0.9009971509971509, "grad_norm": 7.8665606238963495, "learning_rate": 3.001206068268486e-07, "loss": 0.8048, "step": 1265 }, { "epoch": 0.9045584045584045, "grad_norm": 8.328902546236389, "learning_rate": 2.792645283624712e-07, "loss": 0.9001, "step": 1270 }, { "epoch": 0.9081196581196581, "grad_norm": 10.59559443290987, "learning_rate": 2.591386419815611e-07, "loss": 0.8995, "step": 1275 }, { "epoch": 0.9116809116809117, "grad_norm": 8.602142707085447, "learning_rate": 2.3974606070813586e-07, "loss": 0.8291, "step": 1280 }, { "epoch": 0.9152421652421653, "grad_norm": 7.8762061850035066, "learning_rate": 2.210897841403331e-07, "loss": 0.6669, "step": 1285 }, { "epoch": 0.9188034188034188, "grad_norm": 9.12669014044684, "learning_rate": 2.0317269798643734e-07, "loss": 0.7548, "step": 1290 }, { "epoch": 0.9223646723646723, "grad_norm": 9.767085379154615, "learning_rate": 1.8599757361852377e-07, "loss": 0.9479, "step": 1295 }, { "epoch": 0.9259259259259259, "grad_norm": 9.22746537014953, "learning_rate": 1.6956706764379438e-07, "loss": 0.8913, "step": 1300 }, { "epoch": 0.9259259259259259, "eval_cooking_sharegpt_test_loss": 0.8287807703018188, "eval_cooking_sharegpt_test_runtime": 25.1514, "eval_cooking_sharegpt_test_samples_per_second": 11.729, "eval_cooking_sharegpt_test_steps_per_second": 0.596, "step": 1300 }, { "epoch": 0.9294871794871795, "grad_norm": 7.5898355965195625, "learning_rate": 1.538837214936545e-07, "loss": 0.9466, "step": 1305 }, { "epoch": 0.9330484330484331, "grad_norm": 8.673728999485782, "learning_rate": 1.3894996103061553e-07, "loss": 0.8301, "step": 1310 }, { "epoch": 0.9366096866096866, "grad_norm": 6.9836595058978, "learning_rate": 1.2476809617306408e-07, "loss": 0.8211, "step": 1315 }, { "epoch": 0.9401709401709402, "grad_norm": 9.029505361516508, "learning_rate": 1.1134032053797151e-07, "loss": 0.7824, "step": 1320 }, { "epoch": 0.9437321937321937, "grad_norm": 9.315098828395778, "learning_rate": 9.866871110158938e-08, "loss": 0.8833, "step": 1325 }, { "epoch": 0.9472934472934473, "grad_norm": 8.684150309957673, "learning_rate": 8.675522787819023e-08, "loss": 0.7958, "step": 1330 }, { "epoch": 0.9508547008547008, "grad_norm": 7.140798759942066, "learning_rate": 7.56017136168935e-08, "loss": 0.8184, "step": 1335 }, { "epoch": 0.9544159544159544, "grad_norm": 10.172031288704963, "learning_rate": 6.520989351663564e-08, "loss": 0.7399, "step": 1340 }, { "epoch": 0.957977207977208, "grad_norm": 10.991446855606016, "learning_rate": 5.5581374959320366e-08, "loss": 0.712, "step": 1345 }, { "epoch": 0.9615384615384616, "grad_norm": 5.894774647977172, "learning_rate": 4.671764726119299e-08, "loss": 0.9853, "step": 1350 }, { "epoch": 0.9615384615384616, "eval_cooking_sharegpt_test_loss": 0.8273342251777649, "eval_cooking_sharegpt_test_runtime": 25.1865, "eval_cooking_sharegpt_test_samples_per_second": 11.713, "eval_cooking_sharegpt_test_steps_per_second": 0.596, "step": 1350 }, { "epoch": 0.9650997150997151, "grad_norm": 9.38775698699051, "learning_rate": 3.8620081442475864e-08, "loss": 0.8198, "step": 1355 }, { "epoch": 0.9686609686609686, "grad_norm": 9.674254033681382, "learning_rate": 3.128993001530245e-08, "loss": 0.8105, "step": 1360 }, { "epoch": 0.9722222222222222, "grad_norm": 7.151085985782544, "learning_rate": 2.4728326789980606e-08, "loss": 0.7844, "step": 1365 }, { "epoch": 0.9757834757834758, "grad_norm": 7.533191380424758, "learning_rate": 1.8936286699620664e-08, "loss": 0.9371, "step": 1370 }, { "epoch": 0.9793447293447294, "grad_norm": 8.25120065581009, "learning_rate": 1.3914705643143788e-08, "loss": 0.7351, "step": 1375 }, { "epoch": 0.9829059829059829, "grad_norm": 13.140339025291984, "learning_rate": 9.664360346710033e-09, "loss": 0.819, "step": 1380 }, { "epoch": 0.9864672364672364, "grad_norm": 12.24884145145148, "learning_rate": 6.1859082435750115e-09, "loss": 0.9526, "step": 1385 }, { "epoch": 0.99002849002849, "grad_norm": 7.260571264137407, "learning_rate": 3.4798873723984604e-09, "loss": 0.8759, "step": 1390 }, { "epoch": 0.9935897435897436, "grad_norm": 13.23771358391951, "learning_rate": 1.5467162940230318e-09, "loss": 1.0067, "step": 1395 }, { "epoch": 0.9971509971509972, "grad_norm": 7.184551159684903, "learning_rate": 3.8669402673274794e-10, "loss": 0.7037, "step": 1400 }, { "epoch": 0.9971509971509972, "eval_cooking_sharegpt_test_loss": 0.827165424823761, "eval_cooking_sharegpt_test_runtime": 25.1122, "eval_cooking_sharegpt_test_samples_per_second": 11.747, "eval_cooking_sharegpt_test_steps_per_second": 0.597, "step": 1400 }, { "epoch": 1.0, "step": 1404, "total_flos": 4037501583360.0, "train_loss": 1.0267896625051471, "train_runtime": 8316.6296, "train_samples_per_second": 0.675, "train_steps_per_second": 0.169 } ], "logging_steps": 5, "max_steps": 1404, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4037501583360.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }