{ "best_global_step": 1008, "best_metric": 0.38490504026412964, "best_model_checkpoint": "saves/prefix-tuning/llama-3-8b-instruct/train_wsc_1754652157/checkpoint-1008", "epoch": 10.0, "eval_steps": 63, "global_step": 1250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04, "grad_norm": 2.4841861724853516, "learning_rate": 1.6000000000000001e-06, "loss": 15.2081, "num_input_tokens_seen": 2144, "step": 5 }, { "epoch": 0.08, "grad_norm": 2.424849510192871, "learning_rate": 3.6e-06, "loss": 15.3148, "num_input_tokens_seen": 4128, "step": 10 }, { "epoch": 0.12, "grad_norm": 2.6069109439849854, "learning_rate": 5.600000000000001e-06, "loss": 15.3875, "num_input_tokens_seen": 6240, "step": 15 }, { "epoch": 0.16, "grad_norm": 2.666384696960449, "learning_rate": 7.6e-06, "loss": 15.1604, "num_input_tokens_seen": 8096, "step": 20 }, { "epoch": 0.2, "grad_norm": 2.383948564529419, "learning_rate": 9.600000000000001e-06, "loss": 15.358, "num_input_tokens_seen": 10112, "step": 25 }, { "epoch": 0.24, "grad_norm": 2.555549383163452, "learning_rate": 1.16e-05, "loss": 14.8947, "num_input_tokens_seen": 12032, "step": 30 }, { "epoch": 0.28, "grad_norm": 2.3365485668182373, "learning_rate": 1.3600000000000002e-05, "loss": 14.9376, "num_input_tokens_seen": 13824, "step": 35 }, { "epoch": 0.32, "grad_norm": 2.37636137008667, "learning_rate": 1.56e-05, "loss": 15.0763, "num_input_tokens_seen": 15840, "step": 40 }, { "epoch": 0.36, "grad_norm": 2.863443374633789, "learning_rate": 1.76e-05, "loss": 14.7281, "num_input_tokens_seen": 17920, "step": 45 }, { "epoch": 0.4, "grad_norm": 2.697338819503784, "learning_rate": 1.9600000000000002e-05, "loss": 14.723, "num_input_tokens_seen": 19712, "step": 50 }, { "epoch": 0.44, "grad_norm": 2.416774272918701, "learning_rate": 2.16e-05, "loss": 14.5953, "num_input_tokens_seen": 21952, "step": 55 }, { "epoch": 0.48, "grad_norm": 2.4722342491149902, "learning_rate": 2.36e-05, "loss": 13.9838, "num_input_tokens_seen": 24160, "step": 60 }, { "epoch": 0.504, "eval_loss": 13.871087074279785, "eval_runtime": 0.8458, "eval_samples_per_second": 66.213, "eval_steps_per_second": 16.553, "num_input_tokens_seen": 25504, "step": 63 }, { "epoch": 0.52, "grad_norm": 2.531996250152588, "learning_rate": 2.5600000000000002e-05, "loss": 13.8098, "num_input_tokens_seen": 26112, "step": 65 }, { "epoch": 0.56, "grad_norm": 2.4052157402038574, "learning_rate": 2.7600000000000003e-05, "loss": 13.7342, "num_input_tokens_seen": 28064, "step": 70 }, { "epoch": 0.6, "grad_norm": 2.380267381668091, "learning_rate": 2.96e-05, "loss": 13.545, "num_input_tokens_seen": 29824, "step": 75 }, { "epoch": 0.64, "grad_norm": 2.2633209228515625, "learning_rate": 3.16e-05, "loss": 13.168, "num_input_tokens_seen": 31904, "step": 80 }, { "epoch": 0.68, "grad_norm": 2.5283195972442627, "learning_rate": 3.3600000000000004e-05, "loss": 13.0662, "num_input_tokens_seen": 33984, "step": 85 }, { "epoch": 0.72, "grad_norm": 2.547339916229248, "learning_rate": 3.56e-05, "loss": 12.3617, "num_input_tokens_seen": 35776, "step": 90 }, { "epoch": 0.76, "grad_norm": 2.478236675262451, "learning_rate": 3.76e-05, "loss": 12.2163, "num_input_tokens_seen": 37472, "step": 95 }, { "epoch": 0.8, "grad_norm": 2.3510525226593018, "learning_rate": 3.960000000000001e-05, "loss": 12.0549, "num_input_tokens_seen": 39328, "step": 100 }, { "epoch": 0.84, "grad_norm": 2.5149877071380615, "learning_rate": 4.16e-05, "loss": 11.9157, "num_input_tokens_seen": 41280, "step": 105 }, { "epoch": 0.88, "grad_norm": 2.451765537261963, "learning_rate": 4.36e-05, "loss": 11.5939, "num_input_tokens_seen": 43552, "step": 110 }, { "epoch": 0.92, "grad_norm": 2.4347667694091797, "learning_rate": 4.5600000000000004e-05, "loss": 10.6461, "num_input_tokens_seen": 45216, "step": 115 }, { "epoch": 0.96, "grad_norm": 2.257194995880127, "learning_rate": 4.76e-05, "loss": 10.384, "num_input_tokens_seen": 47360, "step": 120 }, { "epoch": 1.0, "grad_norm": 2.9155919551849365, "learning_rate": 4.96e-05, "loss": 9.9251, "num_input_tokens_seen": 49376, "step": 125 }, { "epoch": 1.008, "eval_loss": 9.737223625183105, "eval_runtime": 0.8536, "eval_samples_per_second": 65.608, "eval_steps_per_second": 16.402, "num_input_tokens_seen": 49696, "step": 126 }, { "epoch": 1.04, "grad_norm": 2.394456386566162, "learning_rate": 4.9998440375027166e-05, "loss": 9.5781, "num_input_tokens_seen": 51200, "step": 130 }, { "epoch": 1.08, "grad_norm": 2.3213348388671875, "learning_rate": 4.99921047320825e-05, "loss": 8.9157, "num_input_tokens_seen": 53216, "step": 135 }, { "epoch": 1.12, "grad_norm": 2.2454257011413574, "learning_rate": 4.998089682880117e-05, "loss": 8.0389, "num_input_tokens_seen": 55168, "step": 140 }, { "epoch": 1.16, "grad_norm": 2.323469400405884, "learning_rate": 4.9964818850186135e-05, "loss": 8.6052, "num_input_tokens_seen": 56960, "step": 145 }, { "epoch": 1.2, "grad_norm": 2.213332414627075, "learning_rate": 4.994387393067117e-05, "loss": 7.8179, "num_input_tokens_seen": 58880, "step": 150 }, { "epoch": 1.24, "grad_norm": 1.9772940874099731, "learning_rate": 4.9918066153509834e-05, "loss": 7.3245, "num_input_tokens_seen": 60672, "step": 155 }, { "epoch": 1.28, "grad_norm": 2.1031250953674316, "learning_rate": 4.988740054997943e-05, "loss": 7.6592, "num_input_tokens_seen": 62848, "step": 160 }, { "epoch": 1.32, "grad_norm": 1.5588639974594116, "learning_rate": 4.985188309840012e-05, "loss": 6.5144, "num_input_tokens_seen": 64448, "step": 165 }, { "epoch": 1.3599999999999999, "grad_norm": 1.6640610694885254, "learning_rate": 4.9811520722969465e-05, "loss": 6.9579, "num_input_tokens_seen": 66368, "step": 170 }, { "epoch": 1.4, "grad_norm": 1.4669989347457886, "learning_rate": 4.976632129241252e-05, "loss": 5.7934, "num_input_tokens_seen": 68128, "step": 175 }, { "epoch": 1.44, "grad_norm": 1.5640122890472412, "learning_rate": 4.971629361844785e-05, "loss": 6.0098, "num_input_tokens_seen": 70112, "step": 180 }, { "epoch": 1.48, "grad_norm": 1.9767147302627563, "learning_rate": 4.966144745406961e-05, "loss": 6.0466, "num_input_tokens_seen": 72384, "step": 185 }, { "epoch": 1.512, "eval_loss": 5.308993816375732, "eval_runtime": 0.8536, "eval_samples_per_second": 65.601, "eval_steps_per_second": 16.4, "num_input_tokens_seen": 74112, "step": 189 }, { "epoch": 1.52, "grad_norm": 1.4375022649765015, "learning_rate": 4.960179349164621e-05, "loss": 5.7175, "num_input_tokens_seen": 74752, "step": 190 }, { "epoch": 1.56, "grad_norm": 1.8162568807601929, "learning_rate": 4.953734336083583e-05, "loss": 5.0706, "num_input_tokens_seen": 76640, "step": 195 }, { "epoch": 1.6, "grad_norm": 2.0760273933410645, "learning_rate": 4.946810962631916e-05, "loss": 5.3219, "num_input_tokens_seen": 78784, "step": 200 }, { "epoch": 1.6400000000000001, "grad_norm": 1.7913469076156616, "learning_rate": 4.9394105785349944e-05, "loss": 4.6854, "num_input_tokens_seen": 80768, "step": 205 }, { "epoch": 1.6800000000000002, "grad_norm": 1.471130609512329, "learning_rate": 4.9315346265123594e-05, "loss": 4.0843, "num_input_tokens_seen": 82848, "step": 210 }, { "epoch": 1.72, "grad_norm": 1.8904441595077515, "learning_rate": 4.923184641996463e-05, "loss": 4.0373, "num_input_tokens_seen": 84768, "step": 215 }, { "epoch": 1.76, "grad_norm": 2.13264536857605, "learning_rate": 4.914362252833332e-05, "loss": 3.6419, "num_input_tokens_seen": 86848, "step": 220 }, { "epoch": 1.8, "grad_norm": 1.7601622343063354, "learning_rate": 4.905069178965215e-05, "loss": 3.3145, "num_input_tokens_seen": 88736, "step": 225 }, { "epoch": 1.8399999999999999, "grad_norm": 1.8561395406723022, "learning_rate": 4.8953072320952745e-05, "loss": 3.4327, "num_input_tokens_seen": 90848, "step": 230 }, { "epoch": 1.88, "grad_norm": 1.9273947477340698, "learning_rate": 4.885078315334395e-05, "loss": 3.2526, "num_input_tokens_seen": 92992, "step": 235 }, { "epoch": 1.92, "grad_norm": 2.0955724716186523, "learning_rate": 4.874384422830167e-05, "loss": 2.791, "num_input_tokens_seen": 94880, "step": 240 }, { "epoch": 1.96, "grad_norm": 1.2840852737426758, "learning_rate": 4.863227639378124e-05, "loss": 2.2217, "num_input_tokens_seen": 96704, "step": 245 }, { "epoch": 2.0, "grad_norm": 1.7399126291275024, "learning_rate": 4.851610140015304e-05, "loss": 1.6788, "num_input_tokens_seen": 98240, "step": 250 }, { "epoch": 2.016, "eval_loss": 1.980229139328003, "eval_runtime": 0.8532, "eval_samples_per_second": 65.632, "eval_steps_per_second": 16.408, "num_input_tokens_seen": 99136, "step": 252 }, { "epoch": 2.04, "grad_norm": 1.2386776208877563, "learning_rate": 4.839534189596228e-05, "loss": 2.0365, "num_input_tokens_seen": 100224, "step": 255 }, { "epoch": 2.08, "grad_norm": 1.1297599077224731, "learning_rate": 4.8270021423513554e-05, "loss": 1.4581, "num_input_tokens_seen": 101920, "step": 260 }, { "epoch": 2.12, "grad_norm": 1.090529203414917, "learning_rate": 4.8140164414281306e-05, "loss": 1.2941, "num_input_tokens_seen": 103808, "step": 265 }, { "epoch": 2.16, "grad_norm": 1.0772465467453003, "learning_rate": 4.800579618414676e-05, "loss": 1.4016, "num_input_tokens_seen": 105920, "step": 270 }, { "epoch": 2.2, "grad_norm": 1.179438591003418, "learning_rate": 4.7866942928462625e-05, "loss": 1.5631, "num_input_tokens_seen": 108160, "step": 275 }, { "epoch": 2.24, "grad_norm": 1.1663882732391357, "learning_rate": 4.772363171694622e-05, "loss": 0.856, "num_input_tokens_seen": 109920, "step": 280 }, { "epoch": 2.2800000000000002, "grad_norm": 1.3647843599319458, "learning_rate": 4.7575890488402185e-05, "loss": 1.1216, "num_input_tokens_seen": 111904, "step": 285 }, { "epoch": 2.32, "grad_norm": 1.4680769443511963, "learning_rate": 4.742374804527575e-05, "loss": 0.776, "num_input_tokens_seen": 113632, "step": 290 }, { "epoch": 2.36, "grad_norm": 0.9575800895690918, "learning_rate": 4.7267234048037664e-05, "loss": 1.0421, "num_input_tokens_seen": 115616, "step": 295 }, { "epoch": 2.4, "grad_norm": 0.7263527512550354, "learning_rate": 4.710637900940181e-05, "loss": 0.7994, "num_input_tokens_seen": 117472, "step": 300 }, { "epoch": 2.44, "grad_norm": 1.1039828062057495, "learning_rate": 4.694121428837668e-05, "loss": 1.0609, "num_input_tokens_seen": 119616, "step": 305 }, { "epoch": 2.48, "grad_norm": 0.9919302463531494, "learning_rate": 4.6771772084151885e-05, "loss": 0.8039, "num_input_tokens_seen": 121568, "step": 310 }, { "epoch": 2.52, "grad_norm": 1.072923183441162, "learning_rate": 4.659808542982088e-05, "loss": 1.0818, "num_input_tokens_seen": 123904, "step": 315 }, { "epoch": 2.52, "eval_loss": 0.7967647314071655, "eval_runtime": 0.8522, "eval_samples_per_second": 65.715, "eval_steps_per_second": 16.429, "num_input_tokens_seen": 123904, "step": 315 }, { "epoch": 2.56, "grad_norm": 0.6072083711624146, "learning_rate": 4.642018818594107e-05, "loss": 0.5327, "num_input_tokens_seen": 125696, "step": 320 }, { "epoch": 2.6, "grad_norm": 0.9770075678825378, "learning_rate": 4.6238115033932636e-05, "loss": 0.6088, "num_input_tokens_seen": 127488, "step": 325 }, { "epoch": 2.64, "grad_norm": 1.5782830715179443, "learning_rate": 4.605190146931731e-05, "loss": 0.7891, "num_input_tokens_seen": 129632, "step": 330 }, { "epoch": 2.68, "grad_norm": 1.686546802520752, "learning_rate": 4.586158379479848e-05, "loss": 0.7432, "num_input_tokens_seen": 131680, "step": 335 }, { "epoch": 2.7199999999999998, "grad_norm": 0.8748849630355835, "learning_rate": 4.566719911318389e-05, "loss": 0.5091, "num_input_tokens_seen": 133472, "step": 340 }, { "epoch": 2.76, "grad_norm": 0.690437912940979, "learning_rate": 4.5468785320152365e-05, "loss": 0.5338, "num_input_tokens_seen": 135200, "step": 345 }, { "epoch": 2.8, "grad_norm": 1.1224329471588135, "learning_rate": 4.5266381096866e-05, "loss": 0.9588, "num_input_tokens_seen": 137536, "step": 350 }, { "epoch": 2.84, "grad_norm": 0.998359203338623, "learning_rate": 4.5060025902429174e-05, "loss": 0.7235, "num_input_tokens_seen": 139744, "step": 355 }, { "epoch": 2.88, "grad_norm": 1.1624242067337036, "learning_rate": 4.484975996619589e-05, "loss": 0.703, "num_input_tokens_seen": 141760, "step": 360 }, { "epoch": 2.92, "grad_norm": 1.1226049661636353, "learning_rate": 4.4635624279927044e-05, "loss": 0.6324, "num_input_tokens_seen": 143872, "step": 365 }, { "epoch": 2.96, "grad_norm": 0.6500603556632996, "learning_rate": 4.441766058979898e-05, "loss": 0.5377, "num_input_tokens_seen": 145856, "step": 370 }, { "epoch": 3.0, "grad_norm": 1.9589425325393677, "learning_rate": 4.4195911388264946e-05, "loss": 0.8394, "num_input_tokens_seen": 147648, "step": 375 }, { "epoch": 3.024, "eval_loss": 0.5601035952568054, "eval_runtime": 0.8544, "eval_samples_per_second": 65.545, "eval_steps_per_second": 16.386, "num_input_tokens_seen": 148736, "step": 378 }, { "epoch": 3.04, "grad_norm": 1.4394139051437378, "learning_rate": 4.3970419905771145e-05, "loss": 0.4311, "num_input_tokens_seen": 149472, "step": 380 }, { "epoch": 3.08, "grad_norm": 0.8933354020118713, "learning_rate": 4.374123010232888e-05, "loss": 0.6238, "num_input_tokens_seen": 151552, "step": 385 }, { "epoch": 3.12, "grad_norm": 0.8065023422241211, "learning_rate": 4.350838665894446e-05, "loss": 0.5635, "num_input_tokens_seen": 153568, "step": 390 }, { "epoch": 3.16, "grad_norm": 0.6302071213722229, "learning_rate": 4.3271934968908514e-05, "loss": 0.4754, "num_input_tokens_seen": 155616, "step": 395 }, { "epoch": 3.2, "grad_norm": 0.5202668309211731, "learning_rate": 4.303192112894652e-05, "loss": 0.6315, "num_input_tokens_seen": 157728, "step": 400 }, { "epoch": 3.24, "grad_norm": 0.9103642702102661, "learning_rate": 4.278839193023214e-05, "loss": 0.4821, "num_input_tokens_seen": 159488, "step": 405 }, { "epoch": 3.2800000000000002, "grad_norm": 0.6605204939842224, "learning_rate": 4.254139484926519e-05, "loss": 0.5577, "num_input_tokens_seen": 161600, "step": 410 }, { "epoch": 3.32, "grad_norm": 0.8729458451271057, "learning_rate": 4.2290978038616e-05, "loss": 0.5065, "num_input_tokens_seen": 163712, "step": 415 }, { "epoch": 3.36, "grad_norm": 0.7586544752120972, "learning_rate": 4.2037190317538e-05, "loss": 0.4285, "num_input_tokens_seen": 165536, "step": 420 }, { "epoch": 3.4, "grad_norm": 1.765929102897644, "learning_rate": 4.178008116245024e-05, "loss": 0.5842, "num_input_tokens_seen": 167872, "step": 425 }, { "epoch": 3.44, "grad_norm": 0.9758070111274719, "learning_rate": 4.1519700697291944e-05, "loss": 0.6883, "num_input_tokens_seen": 170112, "step": 430 }, { "epoch": 3.48, "grad_norm": 1.6534411907196045, "learning_rate": 4.125609968375072e-05, "loss": 0.4686, "num_input_tokens_seen": 172000, "step": 435 }, { "epoch": 3.52, "grad_norm": 0.6654028296470642, "learning_rate": 4.098932951136645e-05, "loss": 0.5184, "num_input_tokens_seen": 174016, "step": 440 }, { "epoch": 3.528, "eval_loss": 0.4781542420387268, "eval_runtime": 0.8555, "eval_samples_per_second": 65.46, "eval_steps_per_second": 16.365, "num_input_tokens_seen": 174432, "step": 441 }, { "epoch": 3.56, "grad_norm": 0.7983071208000183, "learning_rate": 4.071944218751282e-05, "loss": 0.4466, "num_input_tokens_seen": 175776, "step": 445 }, { "epoch": 3.6, "grad_norm": 0.9768652319908142, "learning_rate": 4.044649032725836e-05, "loss": 0.5084, "num_input_tokens_seen": 177952, "step": 450 }, { "epoch": 3.64, "grad_norm": 1.1632089614868164, "learning_rate": 4.017052714310906e-05, "loss": 0.3983, "num_input_tokens_seen": 179968, "step": 455 }, { "epoch": 3.68, "grad_norm": 1.7327042818069458, "learning_rate": 3.989160643463445e-05, "loss": 0.4557, "num_input_tokens_seen": 181952, "step": 460 }, { "epoch": 3.7199999999999998, "grad_norm": 1.0546882152557373, "learning_rate": 3.960978257797931e-05, "loss": 0.3364, "num_input_tokens_seen": 183680, "step": 465 }, { "epoch": 3.76, "grad_norm": 0.5163419246673584, "learning_rate": 3.932511051526289e-05, "loss": 0.4238, "num_input_tokens_seen": 185632, "step": 470 }, { "epoch": 3.8, "grad_norm": 1.192272663116455, "learning_rate": 3.903764574386786e-05, "loss": 0.46, "num_input_tokens_seen": 187552, "step": 475 }, { "epoch": 3.84, "grad_norm": 1.180910348892212, "learning_rate": 3.8747444305621e-05, "loss": 0.4533, "num_input_tokens_seen": 189408, "step": 480 }, { "epoch": 3.88, "grad_norm": 0.7053818702697754, "learning_rate": 3.8454562775867684e-05, "loss": 0.4832, "num_input_tokens_seen": 191488, "step": 485 }, { "epoch": 3.92, "grad_norm": 1.434988260269165, "learning_rate": 3.8159058252442446e-05, "loss": 0.4162, "num_input_tokens_seen": 193312, "step": 490 }, { "epoch": 3.96, "grad_norm": 0.8368187546730042, "learning_rate": 3.786098834453766e-05, "loss": 0.4683, "num_input_tokens_seen": 195424, "step": 495 }, { "epoch": 4.0, "grad_norm": 1.403336763381958, "learning_rate": 3.7560411161472456e-05, "loss": 0.3853, "num_input_tokens_seen": 197024, "step": 500 }, { "epoch": 4.032, "eval_loss": 0.46134254336357117, "eval_runtime": 0.8491, "eval_samples_per_second": 65.955, "eval_steps_per_second": 16.489, "num_input_tokens_seen": 198656, "step": 504 }, { "epoch": 4.04, "grad_norm": 0.6058657169342041, "learning_rate": 3.725738530136422e-05, "loss": 0.5412, "num_input_tokens_seen": 199040, "step": 505 }, { "epoch": 4.08, "grad_norm": 1.2126246690750122, "learning_rate": 3.695196983970481e-05, "loss": 0.4867, "num_input_tokens_seen": 200960, "step": 510 }, { "epoch": 4.12, "grad_norm": 0.5591365098953247, "learning_rate": 3.664422431784361e-05, "loss": 0.3728, "num_input_tokens_seen": 203008, "step": 515 }, { "epoch": 4.16, "grad_norm": 1.040564775466919, "learning_rate": 3.633420873137988e-05, "loss": 0.378, "num_input_tokens_seen": 204672, "step": 520 }, { "epoch": 4.2, "grad_norm": 0.4747677743434906, "learning_rate": 3.602198351846647e-05, "loss": 0.4497, "num_input_tokens_seen": 206784, "step": 525 }, { "epoch": 4.24, "grad_norm": 0.8027376532554626, "learning_rate": 3.570760954802726e-05, "loss": 0.3772, "num_input_tokens_seen": 208672, "step": 530 }, { "epoch": 4.28, "grad_norm": 0.7639246582984924, "learning_rate": 3.53911481078907e-05, "loss": 0.4375, "num_input_tokens_seen": 210752, "step": 535 }, { "epoch": 4.32, "grad_norm": 0.7363210320472717, "learning_rate": 3.507266089284157e-05, "loss": 0.5944, "num_input_tokens_seen": 213472, "step": 540 }, { "epoch": 4.36, "grad_norm": 1.636357307434082, "learning_rate": 3.475220999259349e-05, "loss": 0.4546, "num_input_tokens_seen": 215616, "step": 545 }, { "epoch": 4.4, "grad_norm": 0.7487366795539856, "learning_rate": 3.442985787968442e-05, "loss": 0.4207, "num_input_tokens_seen": 217664, "step": 550 }, { "epoch": 4.44, "grad_norm": 0.5726836323738098, "learning_rate": 3.410566739729746e-05, "loss": 0.4204, "num_input_tokens_seen": 219584, "step": 555 }, { "epoch": 4.48, "grad_norm": 0.7676336169242859, "learning_rate": 3.3779701747009504e-05, "loss": 0.4381, "num_input_tokens_seen": 221504, "step": 560 }, { "epoch": 4.52, "grad_norm": 0.8204706907272339, "learning_rate": 3.3452024476469934e-05, "loss": 0.4549, "num_input_tokens_seen": 223424, "step": 565 }, { "epoch": 4.536, "eval_loss": 0.43884095549583435, "eval_runtime": 0.8563, "eval_samples_per_second": 65.4, "eval_steps_per_second": 16.35, "num_input_tokens_seen": 224032, "step": 567 }, { "epoch": 4.5600000000000005, "grad_norm": 1.2742042541503906, "learning_rate": 3.312269946701191e-05, "loss": 0.4388, "num_input_tokens_seen": 225216, "step": 570 }, { "epoch": 4.6, "grad_norm": 1.1421749591827393, "learning_rate": 3.279179092119855e-05, "loss": 0.3681, "num_input_tokens_seen": 227008, "step": 575 }, { "epoch": 4.64, "grad_norm": 1.359496831893921, "learning_rate": 3.245936335030651e-05, "loss": 0.4424, "num_input_tokens_seen": 228736, "step": 580 }, { "epoch": 4.68, "grad_norm": 0.7598556280136108, "learning_rate": 3.21254815617494e-05, "loss": 0.3985, "num_input_tokens_seen": 230240, "step": 585 }, { "epoch": 4.72, "grad_norm": 0.4738464057445526, "learning_rate": 3.179021064644347e-05, "loss": 0.428, "num_input_tokens_seen": 232192, "step": 590 }, { "epoch": 4.76, "grad_norm": 0.8662070035934448, "learning_rate": 3.145361596611795e-05, "loss": 0.416, "num_input_tokens_seen": 234368, "step": 595 }, { "epoch": 4.8, "grad_norm": 0.7484014630317688, "learning_rate": 3.111576314057268e-05, "loss": 0.395, "num_input_tokens_seen": 236032, "step": 600 }, { "epoch": 4.84, "grad_norm": 0.9487155079841614, "learning_rate": 3.0776718034885454e-05, "loss": 0.3691, "num_input_tokens_seen": 237920, "step": 605 }, { "epoch": 4.88, "grad_norm": 0.6988774538040161, "learning_rate": 3.0436546746571372e-05, "loss": 0.3724, "num_input_tokens_seen": 239680, "step": 610 }, { "epoch": 4.92, "grad_norm": 0.8400317430496216, "learning_rate": 3.0095315592697126e-05, "loss": 0.3814, "num_input_tokens_seen": 241504, "step": 615 }, { "epoch": 4.96, "grad_norm": 0.9846633672714233, "learning_rate": 2.9753091096952255e-05, "loss": 0.4676, "num_input_tokens_seen": 243584, "step": 620 }, { "epoch": 5.0, "grad_norm": 0.6307002902030945, "learning_rate": 2.9409939976680313e-05, "loss": 0.4232, "num_input_tokens_seen": 245472, "step": 625 }, { "epoch": 5.04, "grad_norm": 1.2824044227600098, "learning_rate": 2.9065929129872094e-05, "loss": 0.4193, "num_input_tokens_seen": 247424, "step": 630 }, { "epoch": 5.04, "eval_loss": 0.42154452204704285, "eval_runtime": 0.8526, "eval_samples_per_second": 65.682, "eval_steps_per_second": 16.42, "num_input_tokens_seen": 247424, "step": 630 }, { "epoch": 5.08, "grad_norm": 0.6836050152778625, "learning_rate": 2.8721125622123806e-05, "loss": 0.3778, "num_input_tokens_seen": 249472, "step": 635 }, { "epoch": 5.12, "grad_norm": 1.2711609601974487, "learning_rate": 2.8375596673562482e-05, "loss": 0.3189, "num_input_tokens_seen": 251296, "step": 640 }, { "epoch": 5.16, "grad_norm": 1.1848688125610352, "learning_rate": 2.8029409645741267e-05, "loss": 0.3988, "num_input_tokens_seen": 253344, "step": 645 }, { "epoch": 5.2, "grad_norm": 0.43571653962135315, "learning_rate": 2.7682632028507167e-05, "loss": 0.3687, "num_input_tokens_seen": 255104, "step": 650 }, { "epoch": 5.24, "grad_norm": 2.1147029399871826, "learning_rate": 2.733533142684377e-05, "loss": 0.3907, "num_input_tokens_seen": 256832, "step": 655 }, { "epoch": 5.28, "grad_norm": 0.5585914850234985, "learning_rate": 2.6987575547691497e-05, "loss": 0.4176, "num_input_tokens_seen": 258720, "step": 660 }, { "epoch": 5.32, "grad_norm": 1.013206124305725, "learning_rate": 2.6639432186748043e-05, "loss": 0.4127, "num_input_tokens_seen": 260576, "step": 665 }, { "epoch": 5.36, "grad_norm": 0.6060461401939392, "learning_rate": 2.6290969215251416e-05, "loss": 0.3817, "num_input_tokens_seen": 262368, "step": 670 }, { "epoch": 5.4, "grad_norm": 0.9579009413719177, "learning_rate": 2.594225456674837e-05, "loss": 0.3826, "num_input_tokens_seen": 264320, "step": 675 }, { "epoch": 5.44, "grad_norm": 1.1483041048049927, "learning_rate": 2.559335622385055e-05, "loss": 0.3825, "num_input_tokens_seen": 266304, "step": 680 }, { "epoch": 5.48, "grad_norm": 0.8167299628257751, "learning_rate": 2.524434220498123e-05, "loss": 0.3944, "num_input_tokens_seen": 268384, "step": 685 }, { "epoch": 5.52, "grad_norm": 0.5977395176887512, "learning_rate": 2.4895280551114907e-05, "loss": 0.3691, "num_input_tokens_seen": 270208, "step": 690 }, { "epoch": 5.5440000000000005, "eval_loss": 0.4073176383972168, "eval_runtime": 0.8504, "eval_samples_per_second": 65.848, "eval_steps_per_second": 16.462, "num_input_tokens_seen": 271232, "step": 693 }, { "epoch": 5.5600000000000005, "grad_norm": 1.8195998668670654, "learning_rate": 2.4546239312512635e-05, "loss": 0.4443, "num_input_tokens_seen": 271840, "step": 695 }, { "epoch": 5.6, "grad_norm": 1.045985460281372, "learning_rate": 2.4197286535455464e-05, "loss": 0.502, "num_input_tokens_seen": 273888, "step": 700 }, { "epoch": 5.64, "grad_norm": 1.0764570236206055, "learning_rate": 2.384849024897869e-05, "loss": 0.4579, "num_input_tokens_seen": 275904, "step": 705 }, { "epoch": 5.68, "grad_norm": 0.2045237421989441, "learning_rate": 2.349991845160949e-05, "loss": 0.4459, "num_input_tokens_seen": 277888, "step": 710 }, { "epoch": 5.72, "grad_norm": 1.0971755981445312, "learning_rate": 2.3151639098110377e-05, "loss": 0.4206, "num_input_tokens_seen": 279872, "step": 715 }, { "epoch": 5.76, "grad_norm": 0.39166244864463806, "learning_rate": 2.280372008623142e-05, "loss": 0.3705, "num_input_tokens_seen": 281664, "step": 720 }, { "epoch": 5.8, "grad_norm": 0.7744928002357483, "learning_rate": 2.2456229243473345e-05, "loss": 0.3991, "num_input_tokens_seen": 283776, "step": 725 }, { "epoch": 5.84, "grad_norm": 0.4687577486038208, "learning_rate": 2.2109234313864465e-05, "loss": 0.3788, "num_input_tokens_seen": 285568, "step": 730 }, { "epoch": 5.88, "grad_norm": 0.439230352640152, "learning_rate": 2.176280294475383e-05, "loss": 0.3795, "num_input_tokens_seen": 287360, "step": 735 }, { "epoch": 5.92, "grad_norm": 0.6617324948310852, "learning_rate": 2.1417002673623264e-05, "loss": 0.4557, "num_input_tokens_seen": 289632, "step": 740 }, { "epoch": 5.96, "grad_norm": 0.32485124468803406, "learning_rate": 2.1071900914920816e-05, "loss": 0.4056, "num_input_tokens_seen": 291552, "step": 745 }, { "epoch": 6.0, "grad_norm": 1.3294658660888672, "learning_rate": 2.0727564946918087e-05, "loss": 0.4079, "num_input_tokens_seen": 293616, "step": 750 }, { "epoch": 6.04, "grad_norm": 0.7842015027999878, "learning_rate": 2.038406189859433e-05, "loss": 0.3746, "num_input_tokens_seen": 295440, "step": 755 }, { "epoch": 6.048, "eval_loss": 0.40050727128982544, "eval_runtime": 0.8608, "eval_samples_per_second": 65.059, "eval_steps_per_second": 16.265, "num_input_tokens_seen": 295728, "step": 756 }, { "epoch": 6.08, "grad_norm": 0.7505316734313965, "learning_rate": 2.004145873654942e-05, "loss": 0.4006, "num_input_tokens_seen": 297360, "step": 760 }, { "epoch": 6.12, "grad_norm": 0.787607729434967, "learning_rate": 1.969982225194864e-05, "loss": 0.3928, "num_input_tokens_seen": 299312, "step": 765 }, { "epoch": 6.16, "grad_norm": 0.8527250289916992, "learning_rate": 1.9359219047501565e-05, "loss": 0.3839, "num_input_tokens_seen": 301488, "step": 770 }, { "epoch": 6.2, "grad_norm": 0.8097345232963562, "learning_rate": 1.9019715524477767e-05, "loss": 0.4211, "num_input_tokens_seen": 303696, "step": 775 }, { "epoch": 6.24, "grad_norm": 1.303476095199585, "learning_rate": 1.868137786976177e-05, "loss": 0.3858, "num_input_tokens_seen": 305360, "step": 780 }, { "epoch": 6.28, "grad_norm": 0.7829755544662476, "learning_rate": 1.8344272042949724e-05, "loss": 0.3779, "num_input_tokens_seen": 307408, "step": 785 }, { "epoch": 6.32, "grad_norm": 0.659096360206604, "learning_rate": 1.800846376349051e-05, "loss": 0.4222, "num_input_tokens_seen": 309232, "step": 790 }, { "epoch": 6.36, "grad_norm": 0.6200412511825562, "learning_rate": 1.767401849787357e-05, "loss": 0.3675, "num_input_tokens_seen": 311184, "step": 795 }, { "epoch": 6.4, "grad_norm": 0.5834780335426331, "learning_rate": 1.73410014468661e-05, "loss": 0.3727, "num_input_tokens_seen": 313072, "step": 800 }, { "epoch": 6.44, "grad_norm": 1.6976076364517212, "learning_rate": 1.7009477532802054e-05, "loss": 0.3823, "num_input_tokens_seen": 314832, "step": 805 }, { "epoch": 6.48, "grad_norm": 1.1492793560028076, "learning_rate": 1.6679511386925337e-05, "loss": 0.3356, "num_input_tokens_seen": 316560, "step": 810 }, { "epoch": 6.52, "grad_norm": 0.6304459571838379, "learning_rate": 1.635116733678988e-05, "loss": 0.427, "num_input_tokens_seen": 318960, "step": 815 }, { "epoch": 6.552, "eval_loss": 0.40054336190223694, "eval_runtime": 0.8527, "eval_samples_per_second": 65.674, "eval_steps_per_second": 16.418, "num_input_tokens_seen": 320464, "step": 819 }, { "epoch": 6.5600000000000005, "grad_norm": 0.825368344783783, "learning_rate": 1.6024509393718844e-05, "loss": 0.3747, "num_input_tokens_seen": 320880, "step": 820 }, { "epoch": 6.6, "grad_norm": 0.5931684970855713, "learning_rate": 1.5699601240325474e-05, "loss": 0.3936, "num_input_tokens_seen": 323184, "step": 825 }, { "epoch": 6.64, "grad_norm": 0.8659843802452087, "learning_rate": 1.5376506218098015e-05, "loss": 0.371, "num_input_tokens_seen": 325168, "step": 830 }, { "epoch": 6.68, "grad_norm": 0.6564781069755554, "learning_rate": 1.505528731505126e-05, "loss": 0.3795, "num_input_tokens_seen": 326992, "step": 835 }, { "epoch": 6.72, "grad_norm": 0.9912712574005127, "learning_rate": 1.4736007153446801e-05, "loss": 0.4107, "num_input_tokens_seen": 329104, "step": 840 }, { "epoch": 6.76, "grad_norm": 0.7603425979614258, "learning_rate": 1.4418727977584774e-05, "loss": 0.3653, "num_input_tokens_seen": 331088, "step": 845 }, { "epoch": 6.8, "grad_norm": 0.5329681038856506, "learning_rate": 1.4103511641669152e-05, "loss": 0.3939, "num_input_tokens_seen": 333008, "step": 850 }, { "epoch": 6.84, "grad_norm": 0.5622363686561584, "learning_rate": 1.3790419597749199e-05, "loss": 0.3725, "num_input_tokens_seen": 335024, "step": 855 }, { "epoch": 6.88, "grad_norm": 0.9950863718986511, "learning_rate": 1.3479512883739232e-05, "loss": 0.4179, "num_input_tokens_seen": 337104, "step": 860 }, { "epoch": 6.92, "grad_norm": 0.7129436135292053, "learning_rate": 1.3170852111519175e-05, "loss": 0.3773, "num_input_tokens_seen": 338960, "step": 865 }, { "epoch": 6.96, "grad_norm": 1.585489273071289, "learning_rate": 1.2864497455118152e-05, "loss": 0.3702, "num_input_tokens_seen": 340848, "step": 870 }, { "epoch": 7.0, "grad_norm": 1.2564765214920044, "learning_rate": 1.2560508638983437e-05, "loss": 0.3594, "num_input_tokens_seen": 343040, "step": 875 }, { "epoch": 7.04, "grad_norm": 0.6992194056510925, "learning_rate": 1.2258944926337057e-05, "loss": 0.3347, "num_input_tokens_seen": 345056, "step": 880 }, { "epoch": 7.056, "eval_loss": 0.41065576672554016, "eval_runtime": 0.8558, "eval_samples_per_second": 65.435, "eval_steps_per_second": 16.359, "num_input_tokens_seen": 345856, "step": 882 }, { "epoch": 7.08, "grad_norm": 0.7301174402236938, "learning_rate": 1.1959865107622307e-05, "loss": 0.3597, "num_input_tokens_seen": 347232, "step": 885 }, { "epoch": 7.12, "grad_norm": 1.776132345199585, "learning_rate": 1.1663327489042435e-05, "loss": 0.4383, "num_input_tokens_seen": 349504, "step": 890 }, { "epoch": 7.16, "grad_norm": 0.7216264605522156, "learning_rate": 1.1369389881193749e-05, "loss": 0.4164, "num_input_tokens_seen": 351296, "step": 895 }, { "epoch": 7.2, "grad_norm": 1.4257539510726929, "learning_rate": 1.107810958779531e-05, "loss": 0.3858, "num_input_tokens_seen": 353248, "step": 900 }, { "epoch": 7.24, "grad_norm": 1.8053635358810425, "learning_rate": 1.0789543394517435e-05, "loss": 0.4069, "num_input_tokens_seen": 355232, "step": 905 }, { "epoch": 7.28, "grad_norm": 0.6325796246528625, "learning_rate": 1.050374755791127e-05, "loss": 0.3591, "num_input_tokens_seen": 357376, "step": 910 }, { "epoch": 7.32, "grad_norm": 1.1207585334777832, "learning_rate": 1.022077779444145e-05, "loss": 0.3539, "num_input_tokens_seen": 359232, "step": 915 }, { "epoch": 7.36, "grad_norm": 1.4070286750793457, "learning_rate": 9.94068926962404e-06, "loss": 0.3527, "num_input_tokens_seen": 361024, "step": 920 }, { "epoch": 7.4, "grad_norm": 0.627964973449707, "learning_rate": 9.663536587271902e-06, "loss": 0.376, "num_input_tokens_seen": 363200, "step": 925 }, { "epoch": 7.44, "grad_norm": 0.6566188335418701, "learning_rate": 9.389373778849612e-06, "loss": 0.3517, "num_input_tokens_seen": 365152, "step": 930 }, { "epoch": 7.48, "grad_norm": 0.6392218470573425, "learning_rate": 9.11825429293989e-06, "loss": 0.4175, "num_input_tokens_seen": 367328, "step": 935 }, { "epoch": 7.52, "grad_norm": 0.5262883901596069, "learning_rate": 8.850230984823735e-06, "loss": 0.358, "num_input_tokens_seen": 369248, "step": 940 }, { "epoch": 7.5600000000000005, "grad_norm": 0.9610680937767029, "learning_rate": 8.585356106176094e-06, "loss": 0.331, "num_input_tokens_seen": 371040, "step": 945 }, { "epoch": 7.5600000000000005, "eval_loss": 0.4088888466358185, "eval_runtime": 0.8481, "eval_samples_per_second": 66.028, "eval_steps_per_second": 16.507, "num_input_tokens_seen": 371040, "step": 945 }, { "epoch": 7.6, "grad_norm": 0.7290608882904053, "learning_rate": 8.323681294879394e-06, "loss": 0.4139, "num_input_tokens_seen": 372928, "step": 950 }, { "epoch": 7.64, "grad_norm": 0.532922089099884, "learning_rate": 8.06525756495657e-06, "loss": 0.4047, "num_input_tokens_seen": 374816, "step": 955 }, { "epoch": 7.68, "grad_norm": 0.77403724193573, "learning_rate": 7.810135296625818e-06, "loss": 0.3748, "num_input_tokens_seen": 376704, "step": 960 }, { "epoch": 7.72, "grad_norm": 0.5218887329101562, "learning_rate": 7.558364226478842e-06, "loss": 0.3622, "num_input_tokens_seen": 378624, "step": 965 }, { "epoch": 7.76, "grad_norm": 0.5533618927001953, "learning_rate": 7.309993437784624e-06, "loss": 0.3991, "num_input_tokens_seen": 380480, "step": 970 }, { "epoch": 7.8, "grad_norm": 0.6465098261833191, "learning_rate": 7.065071350920538e-06, "loss": 0.3731, "num_input_tokens_seen": 382144, "step": 975 }, { "epoch": 7.84, "grad_norm": 0.6888776421546936, "learning_rate": 6.823645713932708e-06, "loss": 0.4033, "num_input_tokens_seen": 384320, "step": 980 }, { "epoch": 7.88, "grad_norm": 0.6575106382369995, "learning_rate": 6.58576359322742e-06, "loss": 0.4272, "num_input_tokens_seen": 386656, "step": 985 }, { "epoch": 7.92, "grad_norm": 0.4115797281265259, "learning_rate": 6.3514713643954475e-06, "loss": 0.3612, "num_input_tokens_seen": 388672, "step": 990 }, { "epoch": 7.96, "grad_norm": 1.37602698802948, "learning_rate": 6.120814703171024e-06, "loss": 0.3805, "num_input_tokens_seen": 390400, "step": 995 }, { "epoch": 8.0, "grad_norm": 0.3773898780345917, "learning_rate": 5.893838576527275e-06, "loss": 0.3604, "num_input_tokens_seen": 392080, "step": 1000 }, { "epoch": 8.04, "grad_norm": 0.7060356140136719, "learning_rate": 5.6705872339098186e-06, "loss": 0.4144, "num_input_tokens_seen": 394160, "step": 1005 }, { "epoch": 8.064, "eval_loss": 0.38490504026412964, "eval_runtime": 0.8524, "eval_samples_per_second": 65.693, "eval_steps_per_second": 16.423, "num_input_tokens_seen": 395216, "step": 1008 }, { "epoch": 8.08, "grad_norm": 0.7593961954116821, "learning_rate": 5.451104198610249e-06, "loss": 0.3568, "num_input_tokens_seen": 395888, "step": 1010 }, { "epoch": 8.12, "grad_norm": 1.2459512948989868, "learning_rate": 5.235432259281175e-06, "loss": 0.3882, "num_input_tokens_seen": 398032, "step": 1015 }, { "epoch": 8.16, "grad_norm": 1.0449175834655762, "learning_rate": 5.023613461594512e-06, "loss": 0.3798, "num_input_tokens_seen": 399856, "step": 1020 }, { "epoch": 8.2, "grad_norm": 0.7643663287162781, "learning_rate": 4.8156891000445406e-06, "loss": 0.3718, "num_input_tokens_seen": 401616, "step": 1025 }, { "epoch": 8.24, "grad_norm": 0.7826510071754456, "learning_rate": 4.6116997098975465e-06, "loss": 0.3458, "num_input_tokens_seen": 403568, "step": 1030 }, { "epoch": 8.28, "grad_norm": 0.6359825134277344, "learning_rate": 4.411685059289314e-06, "loss": 0.3725, "num_input_tokens_seen": 405712, "step": 1035 }, { "epoch": 8.32, "grad_norm": 0.4394562244415283, "learning_rate": 4.215684141472292e-06, "loss": 0.3745, "num_input_tokens_seen": 407888, "step": 1040 }, { "epoch": 8.36, "grad_norm": 0.6596431732177734, "learning_rate": 4.023735167213752e-06, "loss": 0.3587, "num_input_tokens_seen": 409712, "step": 1045 }, { "epoch": 8.4, "grad_norm": 0.9594627618789673, "learning_rate": 3.835875557346552e-06, "loss": 0.3633, "num_input_tokens_seen": 411504, "step": 1050 }, { "epoch": 8.44, "grad_norm": 0.5970214009284973, "learning_rate": 3.6521419354738738e-06, "loss": 0.368, "num_input_tokens_seen": 413168, "step": 1055 }, { "epoch": 8.48, "grad_norm": 0.4645940959453583, "learning_rate": 3.4725701208293435e-06, "loss": 0.3561, "num_input_tokens_seen": 414960, "step": 1060 }, { "epoch": 8.52, "grad_norm": 0.6897419691085815, "learning_rate": 3.297195121294022e-06, "loss": 0.3644, "num_input_tokens_seen": 416880, "step": 1065 }, { "epoch": 8.56, "grad_norm": 0.8192169666290283, "learning_rate": 3.126051126571561e-06, "loss": 0.3779, "num_input_tokens_seen": 418768, "step": 1070 }, { "epoch": 8.568, "eval_loss": 0.38693496584892273, "eval_runtime": 0.8505, "eval_samples_per_second": 65.843, "eval_steps_per_second": 16.461, "num_input_tokens_seen": 419184, "step": 1071 }, { "epoch": 8.6, "grad_norm": 0.9851546287536621, "learning_rate": 2.9591715015228284e-06, "loss": 0.4053, "num_input_tokens_seen": 421008, "step": 1075 }, { "epoch": 8.64, "grad_norm": 0.5261719226837158, "learning_rate": 2.7965887796613884e-06, "loss": 0.3589, "num_input_tokens_seen": 422864, "step": 1080 }, { "epoch": 8.68, "grad_norm": 0.7083816528320312, "learning_rate": 2.6383346568110062e-06, "loss": 0.3792, "num_input_tokens_seen": 424976, "step": 1085 }, { "epoch": 8.72, "grad_norm": 0.8124401569366455, "learning_rate": 2.4844399849264928e-06, "loss": 0.3899, "num_input_tokens_seen": 427120, "step": 1090 }, { "epoch": 8.76, "grad_norm": 0.7263258695602417, "learning_rate": 2.3349347660790582e-06, "loss": 0.4017, "num_input_tokens_seen": 429264, "step": 1095 }, { "epoch": 8.8, "grad_norm": 0.6860739588737488, "learning_rate": 2.189848146607348e-06, "loss": 0.3718, "num_input_tokens_seen": 431056, "step": 1100 }, { "epoch": 8.84, "grad_norm": 0.49945640563964844, "learning_rate": 2.0492084114352965e-06, "loss": 0.3644, "num_input_tokens_seen": 433136, "step": 1105 }, { "epoch": 8.88, "grad_norm": 0.8145077228546143, "learning_rate": 1.913042978557944e-06, "loss": 0.3859, "num_input_tokens_seen": 435216, "step": 1110 }, { "epoch": 8.92, "grad_norm": 1.1362378597259521, "learning_rate": 1.7813783936962258e-06, "loss": 0.3521, "num_input_tokens_seen": 437040, "step": 1115 }, { "epoch": 8.96, "grad_norm": 1.012280821800232, "learning_rate": 1.654240325121831e-06, "loss": 0.3873, "num_input_tokens_seen": 439088, "step": 1120 }, { "epoch": 9.0, "grad_norm": 1.0121548175811768, "learning_rate": 1.5316535586531483e-06, "loss": 0.4015, "num_input_tokens_seen": 440848, "step": 1125 }, { "epoch": 9.04, "grad_norm": 0.9955313205718994, "learning_rate": 1.4136419928231892e-06, "loss": 0.3714, "num_input_tokens_seen": 442864, "step": 1130 }, { "epoch": 9.072, "eval_loss": 0.38989585638046265, "eval_runtime": 0.8537, "eval_samples_per_second": 65.599, "eval_steps_per_second": 16.4, "num_input_tokens_seen": 444560, "step": 1134 }, { "epoch": 9.08, "grad_norm": 0.6308397054672241, "learning_rate": 1.3002286342205462e-06, "loss": 0.3985, "num_input_tokens_seen": 445040, "step": 1135 }, { "epoch": 9.12, "grad_norm": 0.438340961933136, "learning_rate": 1.1914355930041837e-06, "loss": 0.3583, "num_input_tokens_seen": 446864, "step": 1140 }, { "epoch": 9.16, "grad_norm": 0.80867600440979, "learning_rate": 1.087284078593051e-06, "loss": 0.3824, "num_input_tokens_seen": 449008, "step": 1145 }, { "epoch": 9.2, "grad_norm": 1.2045706510543823, "learning_rate": 9.877943955312552e-07, "loss": 0.3471, "num_input_tokens_seen": 450832, "step": 1150 }, { "epoch": 9.24, "grad_norm": 1.406896948814392, "learning_rate": 8.929859395296364e-07, "loss": 0.3737, "num_input_tokens_seen": 452784, "step": 1155 }, { "epoch": 9.28, "grad_norm": 0.849940836429596, "learning_rate": 8.028771936845342e-07, "loss": 0.3719, "num_input_tokens_seen": 454992, "step": 1160 }, { "epoch": 9.32, "grad_norm": 0.889966607093811, "learning_rate": 7.174857248745004e-07, "loss": 0.3571, "num_input_tokens_seen": 456528, "step": 1165 }, { "epoch": 9.36, "grad_norm": 1.1385633945465088, "learning_rate": 6.368281803355691e-07, "loss": 0.3921, "num_input_tokens_seen": 458768, "step": 1170 }, { "epoch": 9.4, "grad_norm": 0.6559231877326965, "learning_rate": 5.609202844158723e-07, "loss": 0.3935, "num_input_tokens_seen": 460816, "step": 1175 }, { "epoch": 9.44, "grad_norm": 0.8425350785255432, "learning_rate": 4.897768355101084e-07, "loss": 0.3657, "num_input_tokens_seen": 462384, "step": 1180 }, { "epoch": 9.48, "grad_norm": 0.9185330271720886, "learning_rate": 4.234117031746143e-07, "loss": 0.3923, "num_input_tokens_seen": 464304, "step": 1185 }, { "epoch": 9.52, "grad_norm": 0.5886545777320862, "learning_rate": 3.6183782542343057e-07, "loss": 0.3341, "num_input_tokens_seen": 466384, "step": 1190 }, { "epoch": 9.56, "grad_norm": 0.7718898057937622, "learning_rate": 3.050672062060278e-07, "loss": 0.3858, "num_input_tokens_seen": 468368, "step": 1195 }, { "epoch": 9.576, "eval_loss": 0.3861676752567291, "eval_runtime": 0.8505, "eval_samples_per_second": 65.845, "eval_steps_per_second": 16.461, "num_input_tokens_seen": 469104, "step": 1197 }, { "epoch": 9.6, "grad_norm": 1.3115769624710083, "learning_rate": 2.531109130671061e-07, "loss": 0.3761, "num_input_tokens_seen": 470192, "step": 1200 }, { "epoch": 9.64, "grad_norm": 0.625157356262207, "learning_rate": 2.0597907498896007e-07, "loss": 0.3878, "num_input_tokens_seen": 472240, "step": 1205 }, { "epoch": 9.68, "grad_norm": 1.4318360090255737, "learning_rate": 1.6368088041681108e-07, "loss": 0.3748, "num_input_tokens_seen": 474160, "step": 1210 }, { "epoch": 9.72, "grad_norm": 0.62953782081604, "learning_rate": 1.2622457546749567e-07, "loss": 0.4067, "num_input_tokens_seen": 476784, "step": 1215 }, { "epoch": 9.76, "grad_norm": 1.3049143552780151, "learning_rate": 9.361746232188495e-08, "loss": 0.3679, "num_input_tokens_seen": 478864, "step": 1220 }, { "epoch": 9.8, "grad_norm": 0.5870345234870911, "learning_rate": 6.586589780128716e-08, "loss": 0.3603, "num_input_tokens_seen": 480816, "step": 1225 }, { "epoch": 9.84, "grad_norm": 0.5798351764678955, "learning_rate": 4.2975292128200064e-08, "loss": 0.384, "num_input_tokens_seen": 482832, "step": 1230 }, { "epoch": 9.88, "grad_norm": 0.6793395280838013, "learning_rate": 2.4950107871549167e-08, "loss": 0.3827, "num_input_tokens_seen": 484912, "step": 1235 }, { "epoch": 9.92, "grad_norm": 0.9757604002952576, "learning_rate": 1.179385907672248e-08, "loss": 0.3863, "num_input_tokens_seen": 486640, "step": 1240 }, { "epoch": 9.96, "grad_norm": 0.7342190742492676, "learning_rate": 3.5091105804907487e-09, "loss": 0.3537, "num_input_tokens_seen": 488336, "step": 1245 }, { "epoch": 10.0, "grad_norm": 0.7796341180801392, "learning_rate": 9.747751098521107e-11, "loss": 0.3453, "num_input_tokens_seen": 490000, "step": 1250 }, { "epoch": 10.0, "num_input_tokens_seen": 490000, "step": 1250, "total_flos": 2.206447853568e+16, "train_loss": 2.282332957649231, "train_runtime": 172.1756, "train_samples_per_second": 28.924, "train_steps_per_second": 7.26 } ], "logging_steps": 5, "max_steps": 1250, "num_input_tokens_seen": 490000, "num_train_epochs": 10, "save_steps": 63, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.206447853568e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }