{ "best_global_step": 23292, "best_metric": 0.461969256401062, "best_model_checkpoint": "saves_multiple/ia3/llama-3-8b-instruct/train_stsb_101112_1760638041/checkpoint-23292", "epoch": 20.0, "eval_steps": 1294, "global_step": 25880, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0038639876352395673, "grad_norm": 3.748467445373535, "learning_rate": 7.727975270479134e-08, "loss": 4.7506, "num_input_tokens_seen": 1504, "step": 5 }, { "epoch": 0.0077279752704791345, "grad_norm": 6.029379844665527, "learning_rate": 1.7387944358578052e-07, "loss": 5.0007, "num_input_tokens_seen": 2944, "step": 10 }, { "epoch": 0.011591962905718702, "grad_norm": 4.384815216064453, "learning_rate": 2.704791344667697e-07, "loss": 4.9281, "num_input_tokens_seen": 4640, "step": 15 }, { "epoch": 0.015455950540958269, "grad_norm": 3.9707894325256348, "learning_rate": 3.670788253477589e-07, "loss": 4.7618, "num_input_tokens_seen": 6240, "step": 20 }, { "epoch": 0.019319938176197836, "grad_norm": 5.605299472808838, "learning_rate": 4.636785162287481e-07, "loss": 4.6884, "num_input_tokens_seen": 7840, "step": 25 }, { "epoch": 0.023183925811437404, "grad_norm": 3.5143213272094727, "learning_rate": 5.602782071097372e-07, "loss": 4.7746, "num_input_tokens_seen": 9536, "step": 30 }, { "epoch": 0.02704791344667697, "grad_norm": 3.726367712020874, "learning_rate": 6.568778979907265e-07, "loss": 4.7654, "num_input_tokens_seen": 11296, "step": 35 }, { "epoch": 0.030911901081916538, "grad_norm": 5.496340751647949, "learning_rate": 7.534775888717157e-07, "loss": 4.8221, "num_input_tokens_seen": 12960, "step": 40 }, { "epoch": 0.0347758887171561, "grad_norm": 3.9401180744171143, "learning_rate": 8.500772797527049e-07, "loss": 4.8919, "num_input_tokens_seen": 14464, "step": 45 }, { "epoch": 0.03863987635239567, "grad_norm": 5.611064910888672, "learning_rate": 9.466769706336941e-07, "loss": 4.6003, "num_input_tokens_seen": 16032, "step": 50 }, { "epoch": 0.04250386398763524, "grad_norm": 5.250827789306641, "learning_rate": 1.0432766615146832e-06, "loss": 4.9659, "num_input_tokens_seen": 17632, "step": 55 }, { "epoch": 0.04636785162287481, "grad_norm": 3.710026502609253, "learning_rate": 1.1398763523956722e-06, "loss": 4.8684, "num_input_tokens_seen": 19424, "step": 60 }, { "epoch": 0.05023183925811438, "grad_norm": 4.784130573272705, "learning_rate": 1.2364760432766615e-06, "loss": 4.7275, "num_input_tokens_seen": 20896, "step": 65 }, { "epoch": 0.05409582689335394, "grad_norm": 4.878135681152344, "learning_rate": 1.3330757341576507e-06, "loss": 5.0711, "num_input_tokens_seen": 22496, "step": 70 }, { "epoch": 0.05795981452859351, "grad_norm": 3.8020005226135254, "learning_rate": 1.42967542503864e-06, "loss": 5.2515, "num_input_tokens_seen": 24000, "step": 75 }, { "epoch": 0.061823802163833076, "grad_norm": 4.708711624145508, "learning_rate": 1.5262751159196291e-06, "loss": 4.7956, "num_input_tokens_seen": 25632, "step": 80 }, { "epoch": 0.06568778979907264, "grad_norm": 7.441239356994629, "learning_rate": 1.6228748068006183e-06, "loss": 4.928, "num_input_tokens_seen": 27488, "step": 85 }, { "epoch": 0.0695517774343122, "grad_norm": 4.343397617340088, "learning_rate": 1.7194744976816076e-06, "loss": 5.1291, "num_input_tokens_seen": 29056, "step": 90 }, { "epoch": 0.07341576506955177, "grad_norm": 4.1103034019470215, "learning_rate": 1.8160741885625968e-06, "loss": 5.1769, "num_input_tokens_seen": 30656, "step": 95 }, { "epoch": 0.07727975270479134, "grad_norm": 4.113183498382568, "learning_rate": 1.9126738794435858e-06, "loss": 5.0759, "num_input_tokens_seen": 32160, "step": 100 }, { "epoch": 0.08114374034003091, "grad_norm": 4.694586753845215, "learning_rate": 2.009273570324575e-06, "loss": 4.5783, "num_input_tokens_seen": 33920, "step": 105 }, { "epoch": 0.08500772797527048, "grad_norm": 5.141090393066406, "learning_rate": 2.1058732612055642e-06, "loss": 4.9787, "num_input_tokens_seen": 35616, "step": 110 }, { "epoch": 0.08887171561051005, "grad_norm": 5.862782001495361, "learning_rate": 2.2024729520865534e-06, "loss": 4.6217, "num_input_tokens_seen": 37152, "step": 115 }, { "epoch": 0.09273570324574962, "grad_norm": 5.363877296447754, "learning_rate": 2.2990726429675427e-06, "loss": 4.6829, "num_input_tokens_seen": 38912, "step": 120 }, { "epoch": 0.09659969088098919, "grad_norm": 4.728428363800049, "learning_rate": 2.3956723338485315e-06, "loss": 5.0983, "num_input_tokens_seen": 40576, "step": 125 }, { "epoch": 0.10046367851622875, "grad_norm": 5.590007781982422, "learning_rate": 2.4922720247295207e-06, "loss": 5.2787, "num_input_tokens_seen": 42432, "step": 130 }, { "epoch": 0.10432766615146831, "grad_norm": 4.193396091461182, "learning_rate": 2.58887171561051e-06, "loss": 4.8991, "num_input_tokens_seen": 44032, "step": 135 }, { "epoch": 0.10819165378670788, "grad_norm": 4.423731327056885, "learning_rate": 2.685471406491499e-06, "loss": 5.1222, "num_input_tokens_seen": 45792, "step": 140 }, { "epoch": 0.11205564142194745, "grad_norm": 5.441370964050293, "learning_rate": 2.7820710973724883e-06, "loss": 5.1084, "num_input_tokens_seen": 47424, "step": 145 }, { "epoch": 0.11591962905718702, "grad_norm": 6.952087879180908, "learning_rate": 2.8786707882534776e-06, "loss": 4.6194, "num_input_tokens_seen": 49312, "step": 150 }, { "epoch": 0.11978361669242658, "grad_norm": 4.715424060821533, "learning_rate": 2.9752704791344668e-06, "loss": 4.9856, "num_input_tokens_seen": 51072, "step": 155 }, { "epoch": 0.12364760432766615, "grad_norm": 4.42991828918457, "learning_rate": 3.071870170015456e-06, "loss": 4.9817, "num_input_tokens_seen": 52896, "step": 160 }, { "epoch": 0.1275115919629057, "grad_norm": 3.777226209640503, "learning_rate": 3.1684698608964452e-06, "loss": 5.1943, "num_input_tokens_seen": 54272, "step": 165 }, { "epoch": 0.13137557959814528, "grad_norm": 4.71563720703125, "learning_rate": 3.2650695517774344e-06, "loss": 5.0198, "num_input_tokens_seen": 56064, "step": 170 }, { "epoch": 0.13523956723338484, "grad_norm": 4.164772033691406, "learning_rate": 3.3616692426584237e-06, "loss": 4.8427, "num_input_tokens_seen": 57952, "step": 175 }, { "epoch": 0.1391035548686244, "grad_norm": 5.398113250732422, "learning_rate": 3.458268933539413e-06, "loss": 4.6521, "num_input_tokens_seen": 59520, "step": 180 }, { "epoch": 0.14296754250386398, "grad_norm": 3.6066384315490723, "learning_rate": 3.554868624420402e-06, "loss": 4.8594, "num_input_tokens_seen": 61024, "step": 185 }, { "epoch": 0.14683153013910355, "grad_norm": 3.3785269260406494, "learning_rate": 3.6514683153013913e-06, "loss": 4.5004, "num_input_tokens_seen": 62880, "step": 190 }, { "epoch": 0.15069551777434312, "grad_norm": 3.9826133251190186, "learning_rate": 3.7480680061823805e-06, "loss": 4.9151, "num_input_tokens_seen": 64512, "step": 195 }, { "epoch": 0.1545595054095827, "grad_norm": 4.409061431884766, "learning_rate": 3.844667697063369e-06, "loss": 5.0913, "num_input_tokens_seen": 66080, "step": 200 }, { "epoch": 0.15842349304482226, "grad_norm": 4.311811923980713, "learning_rate": 3.941267387944358e-06, "loss": 4.7681, "num_input_tokens_seen": 67936, "step": 205 }, { "epoch": 0.16228748068006182, "grad_norm": 4.506394386291504, "learning_rate": 4.037867078825347e-06, "loss": 4.891, "num_input_tokens_seen": 69472, "step": 210 }, { "epoch": 0.1661514683153014, "grad_norm": 4.070672988891602, "learning_rate": 4.1344667697063366e-06, "loss": 5.1593, "num_input_tokens_seen": 71232, "step": 215 }, { "epoch": 0.17001545595054096, "grad_norm": 4.094906330108643, "learning_rate": 4.231066460587326e-06, "loss": 5.5008, "num_input_tokens_seen": 72800, "step": 220 }, { "epoch": 0.17387944358578053, "grad_norm": 6.265533924102783, "learning_rate": 4.327666151468315e-06, "loss": 5.0927, "num_input_tokens_seen": 74272, "step": 225 }, { "epoch": 0.1777434312210201, "grad_norm": 3.686335802078247, "learning_rate": 4.424265842349304e-06, "loss": 4.7141, "num_input_tokens_seen": 76032, "step": 230 }, { "epoch": 0.18160741885625967, "grad_norm": 6.304001331329346, "learning_rate": 4.5208655332302934e-06, "loss": 5.0485, "num_input_tokens_seen": 77760, "step": 235 }, { "epoch": 0.18547140649149924, "grad_norm": 4.006787300109863, "learning_rate": 4.617465224111283e-06, "loss": 5.1054, "num_input_tokens_seen": 79392, "step": 240 }, { "epoch": 0.1893353941267388, "grad_norm": 4.035526275634766, "learning_rate": 4.714064914992272e-06, "loss": 4.9636, "num_input_tokens_seen": 81216, "step": 245 }, { "epoch": 0.19319938176197837, "grad_norm": 4.4466328620910645, "learning_rate": 4.810664605873261e-06, "loss": 4.9559, "num_input_tokens_seen": 83072, "step": 250 }, { "epoch": 0.19706336939721794, "grad_norm": 4.01819372177124, "learning_rate": 4.90726429675425e-06, "loss": 4.5539, "num_input_tokens_seen": 84960, "step": 255 }, { "epoch": 0.2009273570324575, "grad_norm": 4.609978675842285, "learning_rate": 5.0038639876352395e-06, "loss": 5.1852, "num_input_tokens_seen": 86432, "step": 260 }, { "epoch": 0.20479134466769705, "grad_norm": 6.429018497467041, "learning_rate": 5.100463678516229e-06, "loss": 4.8773, "num_input_tokens_seen": 88192, "step": 265 }, { "epoch": 0.20865533230293662, "grad_norm": 4.30826997756958, "learning_rate": 5.197063369397218e-06, "loss": 5.0951, "num_input_tokens_seen": 89856, "step": 270 }, { "epoch": 0.2125193199381762, "grad_norm": 5.384740352630615, "learning_rate": 5.293663060278207e-06, "loss": 4.8981, "num_input_tokens_seen": 91808, "step": 275 }, { "epoch": 0.21638330757341576, "grad_norm": 4.626501560211182, "learning_rate": 5.390262751159196e-06, "loss": 4.877, "num_input_tokens_seen": 93312, "step": 280 }, { "epoch": 0.22024729520865532, "grad_norm": 4.053862571716309, "learning_rate": 5.486862442040186e-06, "loss": 4.9961, "num_input_tokens_seen": 95040, "step": 285 }, { "epoch": 0.2241112828438949, "grad_norm": 4.590604305267334, "learning_rate": 5.583462132921175e-06, "loss": 4.7198, "num_input_tokens_seen": 96672, "step": 290 }, { "epoch": 0.22797527047913446, "grad_norm": 3.4880518913269043, "learning_rate": 5.680061823802164e-06, "loss": 4.618, "num_input_tokens_seen": 98240, "step": 295 }, { "epoch": 0.23183925811437403, "grad_norm": 4.742989540100098, "learning_rate": 5.776661514683153e-06, "loss": 5.396, "num_input_tokens_seen": 99936, "step": 300 }, { "epoch": 0.2357032457496136, "grad_norm": 3.366258382797241, "learning_rate": 5.8732612055641425e-06, "loss": 4.911, "num_input_tokens_seen": 101760, "step": 305 }, { "epoch": 0.23956723338485317, "grad_norm": 4.355172634124756, "learning_rate": 5.969860896445132e-06, "loss": 5.0973, "num_input_tokens_seen": 103680, "step": 310 }, { "epoch": 0.24343122102009274, "grad_norm": 5.6212639808654785, "learning_rate": 6.066460587326121e-06, "loss": 4.8099, "num_input_tokens_seen": 105184, "step": 315 }, { "epoch": 0.2472952086553323, "grad_norm": 4.215370178222656, "learning_rate": 6.16306027820711e-06, "loss": 4.9718, "num_input_tokens_seen": 106816, "step": 320 }, { "epoch": 0.2511591962905719, "grad_norm": 4.7815680503845215, "learning_rate": 6.259659969088099e-06, "loss": 4.7639, "num_input_tokens_seen": 108352, "step": 325 }, { "epoch": 0.2550231839258114, "grad_norm": 3.927884817123413, "learning_rate": 6.356259659969089e-06, "loss": 4.6032, "num_input_tokens_seen": 109792, "step": 330 }, { "epoch": 0.258887171561051, "grad_norm": 4.374295234680176, "learning_rate": 6.452859350850078e-06, "loss": 5.0812, "num_input_tokens_seen": 111616, "step": 335 }, { "epoch": 0.26275115919629055, "grad_norm": 4.519144535064697, "learning_rate": 6.549459041731067e-06, "loss": 4.5283, "num_input_tokens_seen": 113344, "step": 340 }, { "epoch": 0.26661514683153015, "grad_norm": 3.9807240962982178, "learning_rate": 6.646058732612056e-06, "loss": 4.6526, "num_input_tokens_seen": 114912, "step": 345 }, { "epoch": 0.2704791344667697, "grad_norm": 4.884817600250244, "learning_rate": 6.7426584234930455e-06, "loss": 4.8316, "num_input_tokens_seen": 116736, "step": 350 }, { "epoch": 0.2743431221020093, "grad_norm": 4.390430450439453, "learning_rate": 6.839258114374035e-06, "loss": 4.9626, "num_input_tokens_seen": 118368, "step": 355 }, { "epoch": 0.2782071097372488, "grad_norm": 3.822059154510498, "learning_rate": 6.935857805255024e-06, "loss": 4.6833, "num_input_tokens_seen": 120416, "step": 360 }, { "epoch": 0.2820710973724884, "grad_norm": 3.2204782962799072, "learning_rate": 7.032457496136012e-06, "loss": 4.6186, "num_input_tokens_seen": 122016, "step": 365 }, { "epoch": 0.28593508500772796, "grad_norm": 4.45789098739624, "learning_rate": 7.129057187017002e-06, "loss": 5.0211, "num_input_tokens_seen": 123680, "step": 370 }, { "epoch": 0.28979907264296756, "grad_norm": 4.907557964324951, "learning_rate": 7.225656877897991e-06, "loss": 4.5639, "num_input_tokens_seen": 125312, "step": 375 }, { "epoch": 0.2936630602782071, "grad_norm": 3.5397143363952637, "learning_rate": 7.322256568778981e-06, "loss": 4.9994, "num_input_tokens_seen": 127456, "step": 380 }, { "epoch": 0.2975270479134467, "grad_norm": 6.440507888793945, "learning_rate": 7.418856259659969e-06, "loss": 5.0118, "num_input_tokens_seen": 129184, "step": 385 }, { "epoch": 0.30139103554868624, "grad_norm": 4.367912292480469, "learning_rate": 7.515455950540959e-06, "loss": 5.0595, "num_input_tokens_seen": 131008, "step": 390 }, { "epoch": 0.30525502318392583, "grad_norm": 4.043722152709961, "learning_rate": 7.612055641421948e-06, "loss": 4.8668, "num_input_tokens_seen": 132768, "step": 395 }, { "epoch": 0.3091190108191654, "grad_norm": 3.9456069469451904, "learning_rate": 7.708655332302938e-06, "loss": 4.5511, "num_input_tokens_seen": 134528, "step": 400 }, { "epoch": 0.31298299845440497, "grad_norm": 3.3666584491729736, "learning_rate": 7.805255023183925e-06, "loss": 5.0156, "num_input_tokens_seen": 136768, "step": 405 }, { "epoch": 0.3168469860896445, "grad_norm": 4.148830890655518, "learning_rate": 7.901854714064916e-06, "loss": 5.0858, "num_input_tokens_seen": 138272, "step": 410 }, { "epoch": 0.3207109737248841, "grad_norm": 5.846447467803955, "learning_rate": 7.998454404945904e-06, "loss": 4.7454, "num_input_tokens_seen": 139936, "step": 415 }, { "epoch": 0.32457496136012365, "grad_norm": 7.095427989959717, "learning_rate": 8.095054095826895e-06, "loss": 4.8967, "num_input_tokens_seen": 141632, "step": 420 }, { "epoch": 0.3284389489953632, "grad_norm": 4.242493152618408, "learning_rate": 8.191653786707882e-06, "loss": 4.9406, "num_input_tokens_seen": 143264, "step": 425 }, { "epoch": 0.3323029366306028, "grad_norm": 8.70592975616455, "learning_rate": 8.288253477588873e-06, "loss": 5.2297, "num_input_tokens_seen": 144896, "step": 430 }, { "epoch": 0.3361669242658423, "grad_norm": 5.0631208419799805, "learning_rate": 8.38485316846986e-06, "loss": 4.9196, "num_input_tokens_seen": 146816, "step": 435 }, { "epoch": 0.3400309119010819, "grad_norm": 7.075896739959717, "learning_rate": 8.481452859350851e-06, "loss": 5.0261, "num_input_tokens_seen": 148480, "step": 440 }, { "epoch": 0.34389489953632146, "grad_norm": 4.932891368865967, "learning_rate": 8.578052550231839e-06, "loss": 5.1717, "num_input_tokens_seen": 150112, "step": 445 }, { "epoch": 0.34775888717156106, "grad_norm": 3.784811496734619, "learning_rate": 8.67465224111283e-06, "loss": 5.0938, "num_input_tokens_seen": 151616, "step": 450 }, { "epoch": 0.3516228748068006, "grad_norm": 3.547077178955078, "learning_rate": 8.771251931993817e-06, "loss": 5.0017, "num_input_tokens_seen": 153056, "step": 455 }, { "epoch": 0.3554868624420402, "grad_norm": 4.75193977355957, "learning_rate": 8.867851622874808e-06, "loss": 4.5569, "num_input_tokens_seen": 154848, "step": 460 }, { "epoch": 0.35935085007727974, "grad_norm": 3.5024054050445557, "learning_rate": 8.964451313755796e-06, "loss": 4.5542, "num_input_tokens_seen": 156288, "step": 465 }, { "epoch": 0.36321483771251933, "grad_norm": 3.8062102794647217, "learning_rate": 9.061051004636785e-06, "loss": 4.7863, "num_input_tokens_seen": 157824, "step": 470 }, { "epoch": 0.3670788253477589, "grad_norm": 6.066873073577881, "learning_rate": 9.157650695517774e-06, "loss": 4.6034, "num_input_tokens_seen": 159616, "step": 475 }, { "epoch": 0.37094281298299847, "grad_norm": 4.586033344268799, "learning_rate": 9.254250386398764e-06, "loss": 4.5732, "num_input_tokens_seen": 161280, "step": 480 }, { "epoch": 0.374806800618238, "grad_norm": 3.9040396213531494, "learning_rate": 9.350850077279753e-06, "loss": 4.8532, "num_input_tokens_seen": 162880, "step": 485 }, { "epoch": 0.3786707882534776, "grad_norm": 3.374256134033203, "learning_rate": 9.447449768160742e-06, "loss": 4.4885, "num_input_tokens_seen": 164288, "step": 490 }, { "epoch": 0.38253477588871715, "grad_norm": 3.9441723823547363, "learning_rate": 9.544049459041731e-06, "loss": 4.8468, "num_input_tokens_seen": 165632, "step": 495 }, { "epoch": 0.38639876352395675, "grad_norm": 3.901989459991455, "learning_rate": 9.64064914992272e-06, "loss": 4.707, "num_input_tokens_seen": 167264, "step": 500 }, { "epoch": 0.3902627511591963, "grad_norm": 4.19573450088501, "learning_rate": 9.73724884080371e-06, "loss": 4.9003, "num_input_tokens_seen": 169024, "step": 505 }, { "epoch": 0.3941267387944359, "grad_norm": 3.549652576446533, "learning_rate": 9.833848531684699e-06, "loss": 4.3026, "num_input_tokens_seen": 170688, "step": 510 }, { "epoch": 0.3979907264296754, "grad_norm": 4.4814348220825195, "learning_rate": 9.930448222565688e-06, "loss": 4.3496, "num_input_tokens_seen": 172352, "step": 515 }, { "epoch": 0.401854714064915, "grad_norm": 4.326697826385498, "learning_rate": 1.0027047913446677e-05, "loss": 4.5472, "num_input_tokens_seen": 173920, "step": 520 }, { "epoch": 0.40571870170015456, "grad_norm": 3.680799722671509, "learning_rate": 1.0123647604327666e-05, "loss": 4.1258, "num_input_tokens_seen": 175552, "step": 525 }, { "epoch": 0.4095826893353941, "grad_norm": 3.3065130710601807, "learning_rate": 1.0220247295208656e-05, "loss": 4.1706, "num_input_tokens_seen": 177056, "step": 530 }, { "epoch": 0.4134466769706337, "grad_norm": 3.051771402359009, "learning_rate": 1.0316846986089645e-05, "loss": 4.2575, "num_input_tokens_seen": 178912, "step": 535 }, { "epoch": 0.41731066460587324, "grad_norm": 3.5465526580810547, "learning_rate": 1.0413446676970634e-05, "loss": 4.3261, "num_input_tokens_seen": 180480, "step": 540 }, { "epoch": 0.42117465224111283, "grad_norm": 3.4107913970947266, "learning_rate": 1.0510046367851623e-05, "loss": 3.8777, "num_input_tokens_seen": 182112, "step": 545 }, { "epoch": 0.4250386398763524, "grad_norm": 3.7396440505981445, "learning_rate": 1.0606646058732613e-05, "loss": 4.5715, "num_input_tokens_seen": 183808, "step": 550 }, { "epoch": 0.42890262751159197, "grad_norm": 3.5915520191192627, "learning_rate": 1.0703245749613602e-05, "loss": 3.6851, "num_input_tokens_seen": 185344, "step": 555 }, { "epoch": 0.4327666151468315, "grad_norm": 3.3401858806610107, "learning_rate": 1.0799845440494591e-05, "loss": 3.7346, "num_input_tokens_seen": 187232, "step": 560 }, { "epoch": 0.4366306027820711, "grad_norm": 3.358396053314209, "learning_rate": 1.089644513137558e-05, "loss": 3.6518, "num_input_tokens_seen": 188736, "step": 565 }, { "epoch": 0.44049459041731065, "grad_norm": 3.7031607627868652, "learning_rate": 1.099304482225657e-05, "loss": 3.6196, "num_input_tokens_seen": 190368, "step": 570 }, { "epoch": 0.44435857805255025, "grad_norm": 4.3245954513549805, "learning_rate": 1.1089644513137559e-05, "loss": 3.6151, "num_input_tokens_seen": 192032, "step": 575 }, { "epoch": 0.4482225656877898, "grad_norm": 3.289395570755005, "learning_rate": 1.1186244204018548e-05, "loss": 3.6529, "num_input_tokens_seen": 193728, "step": 580 }, { "epoch": 0.4520865533230294, "grad_norm": 3.1782970428466797, "learning_rate": 1.1282843894899537e-05, "loss": 3.7552, "num_input_tokens_seen": 195584, "step": 585 }, { "epoch": 0.4559505409582689, "grad_norm": 3.057476758956909, "learning_rate": 1.1379443585780526e-05, "loss": 3.8149, "num_input_tokens_seen": 197088, "step": 590 }, { "epoch": 0.4598145285935085, "grad_norm": 3.116137742996216, "learning_rate": 1.1476043276661516e-05, "loss": 3.3676, "num_input_tokens_seen": 198624, "step": 595 }, { "epoch": 0.46367851622874806, "grad_norm": 3.2917144298553467, "learning_rate": 1.1572642967542505e-05, "loss": 3.4984, "num_input_tokens_seen": 200384, "step": 600 }, { "epoch": 0.46754250386398766, "grad_norm": 4.168603420257568, "learning_rate": 1.1669242658423494e-05, "loss": 3.5105, "num_input_tokens_seen": 202048, "step": 605 }, { "epoch": 0.4714064914992272, "grad_norm": 3.4152989387512207, "learning_rate": 1.1765842349304483e-05, "loss": 3.232, "num_input_tokens_seen": 203648, "step": 610 }, { "epoch": 0.4752704791344668, "grad_norm": 3.109389305114746, "learning_rate": 1.1862442040185472e-05, "loss": 3.3288, "num_input_tokens_seen": 205312, "step": 615 }, { "epoch": 0.47913446676970634, "grad_norm": 2.726693630218506, "learning_rate": 1.1959041731066462e-05, "loss": 3.15, "num_input_tokens_seen": 207104, "step": 620 }, { "epoch": 0.48299845440494593, "grad_norm": 2.7685933113098145, "learning_rate": 1.2055641421947451e-05, "loss": 3.206, "num_input_tokens_seen": 208832, "step": 625 }, { "epoch": 0.4868624420401855, "grad_norm": 3.2974636554718018, "learning_rate": 1.215224111282844e-05, "loss": 3.1172, "num_input_tokens_seen": 210560, "step": 630 }, { "epoch": 0.490726429675425, "grad_norm": 2.980739116668701, "learning_rate": 1.224884080370943e-05, "loss": 3.068, "num_input_tokens_seen": 212096, "step": 635 }, { "epoch": 0.4945904173106646, "grad_norm": 3.6270532608032227, "learning_rate": 1.2345440494590419e-05, "loss": 2.9492, "num_input_tokens_seen": 213728, "step": 640 }, { "epoch": 0.49845440494590415, "grad_norm": 2.770662546157837, "learning_rate": 1.2442040185471408e-05, "loss": 3.1935, "num_input_tokens_seen": 215840, "step": 645 }, { "epoch": 0.5023183925811437, "grad_norm": 3.234313488006592, "learning_rate": 1.2538639876352395e-05, "loss": 3.0015, "num_input_tokens_seen": 217280, "step": 650 }, { "epoch": 0.5061823802163833, "grad_norm": 2.574077606201172, "learning_rate": 1.2635239567233386e-05, "loss": 2.7686, "num_input_tokens_seen": 219008, "step": 655 }, { "epoch": 0.5100463678516228, "grad_norm": 2.697012186050415, "learning_rate": 1.2731839258114375e-05, "loss": 2.8837, "num_input_tokens_seen": 220448, "step": 660 }, { "epoch": 0.5139103554868625, "grad_norm": 2.5477020740509033, "learning_rate": 1.2828438948995365e-05, "loss": 2.426, "num_input_tokens_seen": 222048, "step": 665 }, { "epoch": 0.517774343122102, "grad_norm": 3.026355743408203, "learning_rate": 1.2925038639876352e-05, "loss": 2.3684, "num_input_tokens_seen": 223616, "step": 670 }, { "epoch": 0.5216383307573416, "grad_norm": 2.4283580780029297, "learning_rate": 1.3021638330757341e-05, "loss": 2.8728, "num_input_tokens_seen": 225472, "step": 675 }, { "epoch": 0.5255023183925811, "grad_norm": 2.65849232673645, "learning_rate": 1.3118238021638332e-05, "loss": 2.6566, "num_input_tokens_seen": 227200, "step": 680 }, { "epoch": 0.5293663060278208, "grad_norm": 2.9257469177246094, "learning_rate": 1.3214837712519322e-05, "loss": 2.706, "num_input_tokens_seen": 228544, "step": 685 }, { "epoch": 0.5332302936630603, "grad_norm": 3.021756887435913, "learning_rate": 1.3311437403400309e-05, "loss": 2.4985, "num_input_tokens_seen": 230048, "step": 690 }, { "epoch": 0.5370942812982998, "grad_norm": 2.1354072093963623, "learning_rate": 1.3408037094281298e-05, "loss": 3.0153, "num_input_tokens_seen": 231744, "step": 695 }, { "epoch": 0.5409582689335394, "grad_norm": 2.920491933822632, "learning_rate": 1.350463678516229e-05, "loss": 2.659, "num_input_tokens_seen": 233472, "step": 700 }, { "epoch": 0.544822256568779, "grad_norm": 2.653836488723755, "learning_rate": 1.3601236476043278e-05, "loss": 2.3389, "num_input_tokens_seen": 235136, "step": 705 }, { "epoch": 0.5486862442040186, "grad_norm": 2.562244176864624, "learning_rate": 1.3697836166924266e-05, "loss": 2.4016, "num_input_tokens_seen": 236928, "step": 710 }, { "epoch": 0.5525502318392581, "grad_norm": 2.167246103286743, "learning_rate": 1.3794435857805255e-05, "loss": 2.7683, "num_input_tokens_seen": 238528, "step": 715 }, { "epoch": 0.5564142194744977, "grad_norm": 2.9806935787200928, "learning_rate": 1.3891035548686246e-05, "loss": 2.6172, "num_input_tokens_seen": 240160, "step": 720 }, { "epoch": 0.5602782071097373, "grad_norm": 2.453310012817383, "learning_rate": 1.3987635239567235e-05, "loss": 2.7919, "num_input_tokens_seen": 241984, "step": 725 }, { "epoch": 0.5641421947449768, "grad_norm": 2.561342477798462, "learning_rate": 1.4084234930448223e-05, "loss": 2.4243, "num_input_tokens_seen": 244064, "step": 730 }, { "epoch": 0.5680061823802164, "grad_norm": 2.1314871311187744, "learning_rate": 1.4180834621329212e-05, "loss": 2.4353, "num_input_tokens_seen": 246016, "step": 735 }, { "epoch": 0.5718701700154559, "grad_norm": 2.8564963340759277, "learning_rate": 1.4277434312210203e-05, "loss": 2.4086, "num_input_tokens_seen": 247968, "step": 740 }, { "epoch": 0.5757341576506955, "grad_norm": 3.089653730392456, "learning_rate": 1.4374034003091192e-05, "loss": 2.4562, "num_input_tokens_seen": 249664, "step": 745 }, { "epoch": 0.5795981452859351, "grad_norm": 2.9772393703460693, "learning_rate": 1.447063369397218e-05, "loss": 2.3698, "num_input_tokens_seen": 251360, "step": 750 }, { "epoch": 0.5834621329211747, "grad_norm": 2.827953815460205, "learning_rate": 1.4567233384853169e-05, "loss": 2.3248, "num_input_tokens_seen": 252928, "step": 755 }, { "epoch": 0.5873261205564142, "grad_norm": 2.4436135292053223, "learning_rate": 1.466383307573416e-05, "loss": 2.3173, "num_input_tokens_seen": 254528, "step": 760 }, { "epoch": 0.5911901081916537, "grad_norm": 2.1673953533172607, "learning_rate": 1.4760432766615149e-05, "loss": 2.2759, "num_input_tokens_seen": 256192, "step": 765 }, { "epoch": 0.5950540958268934, "grad_norm": 2.6707592010498047, "learning_rate": 1.4857032457496137e-05, "loss": 2.4738, "num_input_tokens_seen": 258240, "step": 770 }, { "epoch": 0.5989180834621329, "grad_norm": 2.6966936588287354, "learning_rate": 1.4953632148377126e-05, "loss": 2.4648, "num_input_tokens_seen": 260064, "step": 775 }, { "epoch": 0.6027820710973725, "grad_norm": 2.0628509521484375, "learning_rate": 1.5050231839258113e-05, "loss": 2.302, "num_input_tokens_seen": 261952, "step": 780 }, { "epoch": 0.606646058732612, "grad_norm": 2.7259469032287598, "learning_rate": 1.5146831530139106e-05, "loss": 2.7492, "num_input_tokens_seen": 263744, "step": 785 }, { "epoch": 0.6105100463678517, "grad_norm": 2.4641435146331787, "learning_rate": 1.5243431221020093e-05, "loss": 2.153, "num_input_tokens_seen": 265280, "step": 790 }, { "epoch": 0.6143740340030912, "grad_norm": 2.755915880203247, "learning_rate": 1.5340030911901083e-05, "loss": 2.3057, "num_input_tokens_seen": 266944, "step": 795 }, { "epoch": 0.6182380216383307, "grad_norm": 2.473018169403076, "learning_rate": 1.5436630602782072e-05, "loss": 2.3511, "num_input_tokens_seen": 268544, "step": 800 }, { "epoch": 0.6221020092735703, "grad_norm": 2.3060832023620605, "learning_rate": 1.553323029366306e-05, "loss": 2.2227, "num_input_tokens_seen": 270368, "step": 805 }, { "epoch": 0.6259659969088099, "grad_norm": 2.1077919006347656, "learning_rate": 1.562982998454405e-05, "loss": 2.4877, "num_input_tokens_seen": 272032, "step": 810 }, { "epoch": 0.6298299845440495, "grad_norm": 2.032588481903076, "learning_rate": 1.572642967542504e-05, "loss": 2.0279, "num_input_tokens_seen": 273824, "step": 815 }, { "epoch": 0.633693972179289, "grad_norm": 2.6539316177368164, "learning_rate": 1.582302936630603e-05, "loss": 2.4307, "num_input_tokens_seen": 275616, "step": 820 }, { "epoch": 0.6375579598145286, "grad_norm": 2.250471353530884, "learning_rate": 1.5919629057187018e-05, "loss": 2.0075, "num_input_tokens_seen": 277248, "step": 825 }, { "epoch": 0.6414219474497682, "grad_norm": 2.2076423168182373, "learning_rate": 1.6016228748068007e-05, "loss": 2.3122, "num_input_tokens_seen": 278816, "step": 830 }, { "epoch": 0.6452859350850078, "grad_norm": 2.02557373046875, "learning_rate": 1.6112828438948996e-05, "loss": 2.2101, "num_input_tokens_seen": 280320, "step": 835 }, { "epoch": 0.6491499227202473, "grad_norm": 2.627354621887207, "learning_rate": 1.6209428129829986e-05, "loss": 2.2427, "num_input_tokens_seen": 281824, "step": 840 }, { "epoch": 0.6530139103554868, "grad_norm": 2.2147161960601807, "learning_rate": 1.6306027820710975e-05, "loss": 2.1156, "num_input_tokens_seen": 283456, "step": 845 }, { "epoch": 0.6568778979907264, "grad_norm": 2.743056297302246, "learning_rate": 1.6402627511591964e-05, "loss": 2.2226, "num_input_tokens_seen": 285408, "step": 850 }, { "epoch": 0.660741885625966, "grad_norm": 2.0509793758392334, "learning_rate": 1.6499227202472953e-05, "loss": 2.0031, "num_input_tokens_seen": 286976, "step": 855 }, { "epoch": 0.6646058732612056, "grad_norm": 2.5913126468658447, "learning_rate": 1.6595826893353942e-05, "loss": 2.4167, "num_input_tokens_seen": 288512, "step": 860 }, { "epoch": 0.6684698608964451, "grad_norm": 1.7691892385482788, "learning_rate": 1.6692426584234932e-05, "loss": 1.8028, "num_input_tokens_seen": 290368, "step": 865 }, { "epoch": 0.6723338485316847, "grad_norm": 2.116285800933838, "learning_rate": 1.678902627511592e-05, "loss": 2.2919, "num_input_tokens_seen": 292288, "step": 870 }, { "epoch": 0.6761978361669243, "grad_norm": 1.9856599569320679, "learning_rate": 1.688562596599691e-05, "loss": 1.9699, "num_input_tokens_seen": 294176, "step": 875 }, { "epoch": 0.6800618238021638, "grad_norm": 2.5610709190368652, "learning_rate": 1.69822256568779e-05, "loss": 2.0947, "num_input_tokens_seen": 295936, "step": 880 }, { "epoch": 0.6839258114374034, "grad_norm": 2.237907648086548, "learning_rate": 1.7078825347758885e-05, "loss": 2.0075, "num_input_tokens_seen": 297568, "step": 885 }, { "epoch": 0.6877897990726429, "grad_norm": 2.2458689212799072, "learning_rate": 1.7175425038639878e-05, "loss": 1.722, "num_input_tokens_seen": 299136, "step": 890 }, { "epoch": 0.6916537867078826, "grad_norm": 2.422473669052124, "learning_rate": 1.7272024729520867e-05, "loss": 1.7683, "num_input_tokens_seen": 301024, "step": 895 }, { "epoch": 0.6955177743431221, "grad_norm": 2.0187201499938965, "learning_rate": 1.7368624420401856e-05, "loss": 2.0609, "num_input_tokens_seen": 302848, "step": 900 }, { "epoch": 0.6993817619783617, "grad_norm": 1.9057577848434448, "learning_rate": 1.7465224111282842e-05, "loss": 2.1089, "num_input_tokens_seen": 304512, "step": 905 }, { "epoch": 0.7032457496136012, "grad_norm": 1.848322868347168, "learning_rate": 1.7561823802163835e-05, "loss": 2.0982, "num_input_tokens_seen": 306016, "step": 910 }, { "epoch": 0.7071097372488409, "grad_norm": 1.870103120803833, "learning_rate": 1.7658423493044824e-05, "loss": 1.9035, "num_input_tokens_seen": 307456, "step": 915 }, { "epoch": 0.7109737248840804, "grad_norm": 2.062422275543213, "learning_rate": 1.7755023183925813e-05, "loss": 1.7997, "num_input_tokens_seen": 309312, "step": 920 }, { "epoch": 0.7148377125193199, "grad_norm": 2.2753899097442627, "learning_rate": 1.78516228748068e-05, "loss": 2.0267, "num_input_tokens_seen": 310976, "step": 925 }, { "epoch": 0.7187017001545595, "grad_norm": 1.879282832145691, "learning_rate": 1.794822256568779e-05, "loss": 1.8256, "num_input_tokens_seen": 312704, "step": 930 }, { "epoch": 0.7225656877897991, "grad_norm": 1.7786118984222412, "learning_rate": 1.804482225656878e-05, "loss": 1.737, "num_input_tokens_seen": 314496, "step": 935 }, { "epoch": 0.7264296754250387, "grad_norm": 2.434251308441162, "learning_rate": 1.814142194744977e-05, "loss": 1.7582, "num_input_tokens_seen": 316192, "step": 940 }, { "epoch": 0.7302936630602782, "grad_norm": 1.794087290763855, "learning_rate": 1.8238021638330756e-05, "loss": 1.7727, "num_input_tokens_seen": 317600, "step": 945 }, { "epoch": 0.7341576506955177, "grad_norm": 1.9403486251831055, "learning_rate": 1.833462132921175e-05, "loss": 1.6852, "num_input_tokens_seen": 319136, "step": 950 }, { "epoch": 0.7380216383307573, "grad_norm": 1.8248711824417114, "learning_rate": 1.8431221020092738e-05, "loss": 1.9855, "num_input_tokens_seen": 320896, "step": 955 }, { "epoch": 0.7418856259659969, "grad_norm": 1.4804762601852417, "learning_rate": 1.8527820710973727e-05, "loss": 1.7102, "num_input_tokens_seen": 322720, "step": 960 }, { "epoch": 0.7457496136012365, "grad_norm": 1.5093111991882324, "learning_rate": 1.8624420401854713e-05, "loss": 1.4393, "num_input_tokens_seen": 324608, "step": 965 }, { "epoch": 0.749613601236476, "grad_norm": 1.6962155103683472, "learning_rate": 1.8721020092735705e-05, "loss": 1.6946, "num_input_tokens_seen": 326144, "step": 970 }, { "epoch": 0.7534775888717156, "grad_norm": 2.7902674674987793, "learning_rate": 1.8817619783616695e-05, "loss": 1.6773, "num_input_tokens_seen": 328064, "step": 975 }, { "epoch": 0.7573415765069552, "grad_norm": 1.7222951650619507, "learning_rate": 1.8914219474497684e-05, "loss": 1.7357, "num_input_tokens_seen": 329824, "step": 980 }, { "epoch": 0.7612055641421948, "grad_norm": 1.786777377128601, "learning_rate": 1.901081916537867e-05, "loss": 1.7064, "num_input_tokens_seen": 331424, "step": 985 }, { "epoch": 0.7650695517774343, "grad_norm": 1.8012217283248901, "learning_rate": 1.910741885625966e-05, "loss": 1.7632, "num_input_tokens_seen": 332992, "step": 990 }, { "epoch": 0.7689335394126738, "grad_norm": 1.9429322481155396, "learning_rate": 1.920401854714065e-05, "loss": 1.6488, "num_input_tokens_seen": 334528, "step": 995 }, { "epoch": 0.7727975270479135, "grad_norm": 2.166829824447632, "learning_rate": 1.930061823802164e-05, "loss": 1.6369, "num_input_tokens_seen": 336160, "step": 1000 }, { "epoch": 0.776661514683153, "grad_norm": 1.9060707092285156, "learning_rate": 1.9397217928902626e-05, "loss": 1.4749, "num_input_tokens_seen": 337568, "step": 1005 }, { "epoch": 0.7805255023183926, "grad_norm": 1.9580352306365967, "learning_rate": 1.9493817619783616e-05, "loss": 1.5444, "num_input_tokens_seen": 339328, "step": 1010 }, { "epoch": 0.7843894899536321, "grad_norm": 1.0697630643844604, "learning_rate": 1.9590417310664608e-05, "loss": 1.2553, "num_input_tokens_seen": 341056, "step": 1015 }, { "epoch": 0.7882534775888718, "grad_norm": 1.6274774074554443, "learning_rate": 1.9687017001545598e-05, "loss": 1.8327, "num_input_tokens_seen": 342784, "step": 1020 }, { "epoch": 0.7921174652241113, "grad_norm": 1.5059616565704346, "learning_rate": 1.9783616692426583e-05, "loss": 1.6384, "num_input_tokens_seen": 344448, "step": 1025 }, { "epoch": 0.7959814528593508, "grad_norm": 1.3998714685440063, "learning_rate": 1.9880216383307573e-05, "loss": 1.5608, "num_input_tokens_seen": 346048, "step": 1030 }, { "epoch": 0.7998454404945904, "grad_norm": 1.9578293561935425, "learning_rate": 1.9976816074188565e-05, "loss": 1.3017, "num_input_tokens_seen": 347776, "step": 1035 }, { "epoch": 0.80370942812983, "grad_norm": 1.6103938817977905, "learning_rate": 2.0073415765069554e-05, "loss": 1.3572, "num_input_tokens_seen": 349440, "step": 1040 }, { "epoch": 0.8075734157650696, "grad_norm": 1.2800102233886719, "learning_rate": 2.017001545595054e-05, "loss": 1.4675, "num_input_tokens_seen": 351264, "step": 1045 }, { "epoch": 0.8114374034003091, "grad_norm": 1.597335696220398, "learning_rate": 2.026661514683153e-05, "loss": 1.75, "num_input_tokens_seen": 352800, "step": 1050 }, { "epoch": 0.8153013910355487, "grad_norm": 1.6149063110351562, "learning_rate": 2.0363214837712522e-05, "loss": 1.2935, "num_input_tokens_seen": 354176, "step": 1055 }, { "epoch": 0.8191653786707882, "grad_norm": 1.3387912511825562, "learning_rate": 2.045981452859351e-05, "loss": 1.4095, "num_input_tokens_seen": 355872, "step": 1060 }, { "epoch": 0.8230293663060279, "grad_norm": 1.0016297101974487, "learning_rate": 2.0556414219474497e-05, "loss": 1.3485, "num_input_tokens_seen": 357248, "step": 1065 }, { "epoch": 0.8268933539412674, "grad_norm": 1.3972282409667969, "learning_rate": 2.0653013910355486e-05, "loss": 1.3624, "num_input_tokens_seen": 358944, "step": 1070 }, { "epoch": 0.8307573415765069, "grad_norm": 1.226528525352478, "learning_rate": 2.074961360123648e-05, "loss": 1.5639, "num_input_tokens_seen": 360480, "step": 1075 }, { "epoch": 0.8346213292117465, "grad_norm": 1.6709729433059692, "learning_rate": 2.0846213292117468e-05, "loss": 1.3735, "num_input_tokens_seen": 362176, "step": 1080 }, { "epoch": 0.8384853168469861, "grad_norm": 1.6426457166671753, "learning_rate": 2.0942812982998454e-05, "loss": 1.3943, "num_input_tokens_seen": 363776, "step": 1085 }, { "epoch": 0.8423493044822257, "grad_norm": 1.2450088262557983, "learning_rate": 2.1039412673879443e-05, "loss": 1.5798, "num_input_tokens_seen": 365376, "step": 1090 }, { "epoch": 0.8462132921174652, "grad_norm": 1.3599146604537964, "learning_rate": 2.1136012364760432e-05, "loss": 1.2571, "num_input_tokens_seen": 366880, "step": 1095 }, { "epoch": 0.8500772797527048, "grad_norm": 1.5961339473724365, "learning_rate": 2.1232612055641425e-05, "loss": 1.5045, "num_input_tokens_seen": 368768, "step": 1100 }, { "epoch": 0.8539412673879444, "grad_norm": 1.4076753854751587, "learning_rate": 2.132921174652241e-05, "loss": 1.2204, "num_input_tokens_seen": 370560, "step": 1105 }, { "epoch": 0.8578052550231839, "grad_norm": 1.1057460308074951, "learning_rate": 2.14258114374034e-05, "loss": 1.2157, "num_input_tokens_seen": 372288, "step": 1110 }, { "epoch": 0.8616692426584235, "grad_norm": 1.1548107862472534, "learning_rate": 2.152241112828439e-05, "loss": 1.1433, "num_input_tokens_seen": 373792, "step": 1115 }, { "epoch": 0.865533230293663, "grad_norm": 1.3454567193984985, "learning_rate": 2.1619010819165382e-05, "loss": 1.5597, "num_input_tokens_seen": 375712, "step": 1120 }, { "epoch": 0.8693972179289027, "grad_norm": 1.216407299041748, "learning_rate": 2.1715610510046368e-05, "loss": 1.096, "num_input_tokens_seen": 377408, "step": 1125 }, { "epoch": 0.8732612055641422, "grad_norm": 1.2331002950668335, "learning_rate": 2.1812210200927357e-05, "loss": 1.48, "num_input_tokens_seen": 379328, "step": 1130 }, { "epoch": 0.8771251931993818, "grad_norm": 1.1137765645980835, "learning_rate": 2.1908809891808346e-05, "loss": 1.4072, "num_input_tokens_seen": 380864, "step": 1135 }, { "epoch": 0.8809891808346213, "grad_norm": 1.5340079069137573, "learning_rate": 2.200540958268934e-05, "loss": 1.4567, "num_input_tokens_seen": 382752, "step": 1140 }, { "epoch": 0.884853168469861, "grad_norm": 1.2806669473648071, "learning_rate": 2.2102009273570325e-05, "loss": 1.4774, "num_input_tokens_seen": 384320, "step": 1145 }, { "epoch": 0.8887171561051005, "grad_norm": 1.0417548418045044, "learning_rate": 2.2198608964451314e-05, "loss": 1.1986, "num_input_tokens_seen": 385984, "step": 1150 }, { "epoch": 0.89258114374034, "grad_norm": 0.6041566729545593, "learning_rate": 2.2295208655332303e-05, "loss": 1.1137, "num_input_tokens_seen": 387424, "step": 1155 }, { "epoch": 0.8964451313755796, "grad_norm": 0.61876380443573, "learning_rate": 2.2391808346213296e-05, "loss": 1.2426, "num_input_tokens_seen": 389216, "step": 1160 }, { "epoch": 0.9003091190108191, "grad_norm": 1.1568127870559692, "learning_rate": 2.248840803709428e-05, "loss": 1.1242, "num_input_tokens_seen": 390912, "step": 1165 }, { "epoch": 0.9041731066460588, "grad_norm": 1.6726824045181274, "learning_rate": 2.258500772797527e-05, "loss": 1.0347, "num_input_tokens_seen": 392384, "step": 1170 }, { "epoch": 0.9080370942812983, "grad_norm": 0.8632403612136841, "learning_rate": 2.268160741885626e-05, "loss": 1.2761, "num_input_tokens_seen": 394016, "step": 1175 }, { "epoch": 0.9119010819165378, "grad_norm": 1.0996583700180054, "learning_rate": 2.2778207109737253e-05, "loss": 1.0869, "num_input_tokens_seen": 395808, "step": 1180 }, { "epoch": 0.9157650695517774, "grad_norm": 1.0638176202774048, "learning_rate": 2.287480680061824e-05, "loss": 0.9835, "num_input_tokens_seen": 397472, "step": 1185 }, { "epoch": 0.919629057187017, "grad_norm": 0.5698239803314209, "learning_rate": 2.2971406491499228e-05, "loss": 0.9642, "num_input_tokens_seen": 399168, "step": 1190 }, { "epoch": 0.9234930448222566, "grad_norm": 1.3052140474319458, "learning_rate": 2.3068006182380217e-05, "loss": 1.0516, "num_input_tokens_seen": 400928, "step": 1195 }, { "epoch": 0.9273570324574961, "grad_norm": 1.4698278903961182, "learning_rate": 2.3164605873261206e-05, "loss": 1.2286, "num_input_tokens_seen": 402816, "step": 1200 }, { "epoch": 0.9312210200927357, "grad_norm": 1.1328034400939941, "learning_rate": 2.3261205564142195e-05, "loss": 1.1039, "num_input_tokens_seen": 404608, "step": 1205 }, { "epoch": 0.9350850077279753, "grad_norm": 1.1608318090438843, "learning_rate": 2.3357805255023184e-05, "loss": 1.0172, "num_input_tokens_seen": 406240, "step": 1210 }, { "epoch": 0.9389489953632149, "grad_norm": 1.0156021118164062, "learning_rate": 2.3454404945904174e-05, "loss": 1.1857, "num_input_tokens_seen": 407776, "step": 1215 }, { "epoch": 0.9428129829984544, "grad_norm": 0.6597974300384521, "learning_rate": 2.3551004636785163e-05, "loss": 1.0149, "num_input_tokens_seen": 409600, "step": 1220 }, { "epoch": 0.9466769706336939, "grad_norm": 1.4089607000350952, "learning_rate": 2.3647604327666152e-05, "loss": 1.3442, "num_input_tokens_seen": 411200, "step": 1225 }, { "epoch": 0.9505409582689336, "grad_norm": 0.9699195623397827, "learning_rate": 2.374420401854714e-05, "loss": 1.1019, "num_input_tokens_seen": 413024, "step": 1230 }, { "epoch": 0.9544049459041731, "grad_norm": 0.742306113243103, "learning_rate": 2.384080370942813e-05, "loss": 1.2671, "num_input_tokens_seen": 414752, "step": 1235 }, { "epoch": 0.9582689335394127, "grad_norm": 1.2437924146652222, "learning_rate": 2.393740340030912e-05, "loss": 1.0669, "num_input_tokens_seen": 416064, "step": 1240 }, { "epoch": 0.9621329211746522, "grad_norm": 2.1092562675476074, "learning_rate": 2.403400309119011e-05, "loss": 1.2203, "num_input_tokens_seen": 418144, "step": 1245 }, { "epoch": 0.9659969088098919, "grad_norm": 0.5196625590324402, "learning_rate": 2.4130602782071098e-05, "loss": 1.2206, "num_input_tokens_seen": 419840, "step": 1250 }, { "epoch": 0.9698608964451314, "grad_norm": 0.7301934957504272, "learning_rate": 2.4227202472952087e-05, "loss": 0.9925, "num_input_tokens_seen": 421600, "step": 1255 }, { "epoch": 0.973724884080371, "grad_norm": 0.9703823328018188, "learning_rate": 2.4323802163833077e-05, "loss": 1.0337, "num_input_tokens_seen": 423136, "step": 1260 }, { "epoch": 0.9775888717156105, "grad_norm": 0.9893945455551147, "learning_rate": 2.4420401854714066e-05, "loss": 1.0453, "num_input_tokens_seen": 424960, "step": 1265 }, { "epoch": 0.98145285935085, "grad_norm": 1.0062501430511475, "learning_rate": 2.4517001545595055e-05, "loss": 1.0321, "num_input_tokens_seen": 426432, "step": 1270 }, { "epoch": 0.9853168469860897, "grad_norm": 1.0337120294570923, "learning_rate": 2.4613601236476044e-05, "loss": 1.1013, "num_input_tokens_seen": 428512, "step": 1275 }, { "epoch": 0.9891808346213292, "grad_norm": 1.5293585062026978, "learning_rate": 2.4710200927357034e-05, "loss": 1.3412, "num_input_tokens_seen": 430368, "step": 1280 }, { "epoch": 0.9930448222565688, "grad_norm": 0.9346896409988403, "learning_rate": 2.4806800618238023e-05, "loss": 1.0785, "num_input_tokens_seen": 431936, "step": 1285 }, { "epoch": 0.9969088098918083, "grad_norm": 1.1747796535491943, "learning_rate": 2.4903400309119012e-05, "loss": 1.3293, "num_input_tokens_seen": 433440, "step": 1290 }, { "epoch": 1.0, "eval_loss": 1.0925908088684082, "eval_runtime": 6.358, "eval_samples_per_second": 90.438, "eval_steps_per_second": 22.649, "num_input_tokens_seen": 434624, "step": 1294 }, { "epoch": 1.000772797527048, "grad_norm": 0.7617354393005371, "learning_rate": 2.5e-05, "loss": 1.0414, "num_input_tokens_seen": 434976, "step": 1295 }, { "epoch": 1.0046367851622875, "grad_norm": 1.01580011844635, "learning_rate": 2.509659969088099e-05, "loss": 0.9294, "num_input_tokens_seen": 436704, "step": 1300 }, { "epoch": 1.008500772797527, "grad_norm": 1.3258094787597656, "learning_rate": 2.519319938176198e-05, "loss": 1.2677, "num_input_tokens_seen": 438528, "step": 1305 }, { "epoch": 1.0123647604327666, "grad_norm": 1.2677472829818726, "learning_rate": 2.5289799072642965e-05, "loss": 1.1187, "num_input_tokens_seen": 440032, "step": 1310 }, { "epoch": 1.0162287480680061, "grad_norm": 0.8702294826507568, "learning_rate": 2.5386398763523955e-05, "loss": 1.1242, "num_input_tokens_seen": 441504, "step": 1315 }, { "epoch": 1.0200927357032457, "grad_norm": 1.1946455240249634, "learning_rate": 2.548299845440495e-05, "loss": 1.0978, "num_input_tokens_seen": 443232, "step": 1320 }, { "epoch": 1.0239567233384854, "grad_norm": 0.9569841027259827, "learning_rate": 2.5579598145285937e-05, "loss": 1.3338, "num_input_tokens_seen": 444896, "step": 1325 }, { "epoch": 1.027820710973725, "grad_norm": 0.9219399690628052, "learning_rate": 2.5676197836166926e-05, "loss": 1.0427, "num_input_tokens_seen": 446752, "step": 1330 }, { "epoch": 1.0316846986089645, "grad_norm": 0.8572629690170288, "learning_rate": 2.5772797527047915e-05, "loss": 1.0086, "num_input_tokens_seen": 448256, "step": 1335 }, { "epoch": 1.035548686244204, "grad_norm": 0.6115228533744812, "learning_rate": 2.5869397217928904e-05, "loss": 1.0643, "num_input_tokens_seen": 450080, "step": 1340 }, { "epoch": 1.0394126738794436, "grad_norm": 0.6750025153160095, "learning_rate": 2.5965996908809893e-05, "loss": 1.1318, "num_input_tokens_seen": 451456, "step": 1345 }, { "epoch": 1.0432766615146831, "grad_norm": 0.8592022061347961, "learning_rate": 2.606259659969088e-05, "loss": 0.9465, "num_input_tokens_seen": 452992, "step": 1350 }, { "epoch": 1.0471406491499227, "grad_norm": 0.9373641014099121, "learning_rate": 2.615919629057187e-05, "loss": 1.0915, "num_input_tokens_seen": 454688, "step": 1355 }, { "epoch": 1.0510046367851622, "grad_norm": 0.9919126629829407, "learning_rate": 2.6255795981452864e-05, "loss": 0.9148, "num_input_tokens_seen": 456288, "step": 1360 }, { "epoch": 1.054868624420402, "grad_norm": 1.1174346208572388, "learning_rate": 2.635239567233385e-05, "loss": 0.8939, "num_input_tokens_seen": 457728, "step": 1365 }, { "epoch": 1.0587326120556415, "grad_norm": 1.3723253011703491, "learning_rate": 2.644899536321484e-05, "loss": 0.9545, "num_input_tokens_seen": 459264, "step": 1370 }, { "epoch": 1.062596599690881, "grad_norm": 0.9815837144851685, "learning_rate": 2.654559505409583e-05, "loss": 1.2709, "num_input_tokens_seen": 460832, "step": 1375 }, { "epoch": 1.0664605873261206, "grad_norm": 0.5122582316398621, "learning_rate": 2.6642194744976818e-05, "loss": 0.8807, "num_input_tokens_seen": 462560, "step": 1380 }, { "epoch": 1.0703245749613601, "grad_norm": 0.9154901504516602, "learning_rate": 2.6738794435857807e-05, "loss": 1.0293, "num_input_tokens_seen": 464352, "step": 1385 }, { "epoch": 1.0741885625965997, "grad_norm": 1.0054690837860107, "learning_rate": 2.6835394126738793e-05, "loss": 0.9789, "num_input_tokens_seen": 466176, "step": 1390 }, { "epoch": 1.0780525502318392, "grad_norm": 1.223282814025879, "learning_rate": 2.6931993817619782e-05, "loss": 0.8063, "num_input_tokens_seen": 467744, "step": 1395 }, { "epoch": 1.0819165378670788, "grad_norm": 1.0591392517089844, "learning_rate": 2.702859350850077e-05, "loss": 0.8834, "num_input_tokens_seen": 469408, "step": 1400 }, { "epoch": 1.0857805255023183, "grad_norm": 1.0558182001113892, "learning_rate": 2.7125193199381764e-05, "loss": 0.9824, "num_input_tokens_seen": 470976, "step": 1405 }, { "epoch": 1.089644513137558, "grad_norm": 0.7237772941589355, "learning_rate": 2.7221792890262753e-05, "loss": 0.9296, "num_input_tokens_seen": 472448, "step": 1410 }, { "epoch": 1.0935085007727976, "grad_norm": 1.3949816226959229, "learning_rate": 2.7318392581143742e-05, "loss": 1.3214, "num_input_tokens_seen": 473920, "step": 1415 }, { "epoch": 1.0973724884080371, "grad_norm": 0.6018790602684021, "learning_rate": 2.741499227202473e-05, "loss": 0.8904, "num_input_tokens_seen": 475712, "step": 1420 }, { "epoch": 1.1012364760432767, "grad_norm": 0.7583760619163513, "learning_rate": 2.751159196290572e-05, "loss": 0.8922, "num_input_tokens_seen": 477280, "step": 1425 }, { "epoch": 1.1051004636785162, "grad_norm": 0.8080288171768188, "learning_rate": 2.7608191653786707e-05, "loss": 0.9641, "num_input_tokens_seen": 479168, "step": 1430 }, { "epoch": 1.1089644513137558, "grad_norm": 0.9548705816268921, "learning_rate": 2.7704791344667696e-05, "loss": 0.9563, "num_input_tokens_seen": 480672, "step": 1435 }, { "epoch": 1.1128284389489953, "grad_norm": 1.0227991342544556, "learning_rate": 2.7801391035548685e-05, "loss": 1.1448, "num_input_tokens_seen": 482368, "step": 1440 }, { "epoch": 1.1166924265842348, "grad_norm": 1.5591058731079102, "learning_rate": 2.7897990726429678e-05, "loss": 1.2409, "num_input_tokens_seen": 484064, "step": 1445 }, { "epoch": 1.1205564142194744, "grad_norm": 1.2942003011703491, "learning_rate": 2.7994590417310667e-05, "loss": 1.0844, "num_input_tokens_seen": 485856, "step": 1450 }, { "epoch": 1.1244204018547141, "grad_norm": 0.7008368372917175, "learning_rate": 2.8091190108191656e-05, "loss": 1.0782, "num_input_tokens_seen": 487680, "step": 1455 }, { "epoch": 1.1282843894899537, "grad_norm": 1.1277965307235718, "learning_rate": 2.8187789799072645e-05, "loss": 0.8212, "num_input_tokens_seen": 489312, "step": 1460 }, { "epoch": 1.1321483771251932, "grad_norm": 0.6528925895690918, "learning_rate": 2.8284389489953635e-05, "loss": 0.7724, "num_input_tokens_seen": 490912, "step": 1465 }, { "epoch": 1.1360123647604328, "grad_norm": 0.6824874877929688, "learning_rate": 2.838098918083462e-05, "loss": 1.1005, "num_input_tokens_seen": 492384, "step": 1470 }, { "epoch": 1.1398763523956723, "grad_norm": 0.8505102396011353, "learning_rate": 2.847758887171561e-05, "loss": 0.8409, "num_input_tokens_seen": 494080, "step": 1475 }, { "epoch": 1.1437403400309119, "grad_norm": 0.7250193357467651, "learning_rate": 2.85741885625966e-05, "loss": 0.8134, "num_input_tokens_seen": 495712, "step": 1480 }, { "epoch": 1.1476043276661514, "grad_norm": 0.8499575257301331, "learning_rate": 2.867078825347759e-05, "loss": 0.9393, "num_input_tokens_seen": 497344, "step": 1485 }, { "epoch": 1.1514683153013912, "grad_norm": 0.9785213470458984, "learning_rate": 2.876738794435858e-05, "loss": 0.9224, "num_input_tokens_seen": 498912, "step": 1490 }, { "epoch": 1.1553323029366307, "grad_norm": 1.1880661249160767, "learning_rate": 2.886398763523957e-05, "loss": 0.9998, "num_input_tokens_seen": 500768, "step": 1495 }, { "epoch": 1.1591962905718702, "grad_norm": 1.0143451690673828, "learning_rate": 2.896058732612056e-05, "loss": 1.1151, "num_input_tokens_seen": 502464, "step": 1500 }, { "epoch": 1.1630602782071098, "grad_norm": 1.5148160457611084, "learning_rate": 2.905718701700155e-05, "loss": 0.8165, "num_input_tokens_seen": 504256, "step": 1505 }, { "epoch": 1.1669242658423493, "grad_norm": 0.8548352718353271, "learning_rate": 2.9153786707882534e-05, "loss": 0.8908, "num_input_tokens_seen": 506304, "step": 1510 }, { "epoch": 1.1707882534775889, "grad_norm": 1.164196491241455, "learning_rate": 2.9250386398763523e-05, "loss": 0.9158, "num_input_tokens_seen": 507936, "step": 1515 }, { "epoch": 1.1746522411128284, "grad_norm": 0.784388542175293, "learning_rate": 2.9346986089644513e-05, "loss": 0.8147, "num_input_tokens_seen": 509696, "step": 1520 }, { "epoch": 1.178516228748068, "grad_norm": 0.66580730676651, "learning_rate": 2.9443585780525502e-05, "loss": 1.0163, "num_input_tokens_seen": 511584, "step": 1525 }, { "epoch": 1.1823802163833075, "grad_norm": 1.1403034925460815, "learning_rate": 2.9540185471406495e-05, "loss": 0.9863, "num_input_tokens_seen": 513248, "step": 1530 }, { "epoch": 1.1862442040185472, "grad_norm": 0.8018466830253601, "learning_rate": 2.9636785162287484e-05, "loss": 0.8358, "num_input_tokens_seen": 515072, "step": 1535 }, { "epoch": 1.1901081916537868, "grad_norm": 0.6965013146400452, "learning_rate": 2.9733384853168473e-05, "loss": 0.806, "num_input_tokens_seen": 516704, "step": 1540 }, { "epoch": 1.1939721792890263, "grad_norm": 0.7545727491378784, "learning_rate": 2.9829984544049462e-05, "loss": 0.7389, "num_input_tokens_seen": 518336, "step": 1545 }, { "epoch": 1.1978361669242659, "grad_norm": 0.5878342986106873, "learning_rate": 2.9926584234930448e-05, "loss": 0.7769, "num_input_tokens_seen": 519968, "step": 1550 }, { "epoch": 1.2017001545595054, "grad_norm": 0.9931756258010864, "learning_rate": 3.0023183925811437e-05, "loss": 1.0096, "num_input_tokens_seen": 521664, "step": 1555 }, { "epoch": 1.205564142194745, "grad_norm": 0.5784414410591125, "learning_rate": 3.0119783616692426e-05, "loss": 0.8149, "num_input_tokens_seen": 523456, "step": 1560 }, { "epoch": 1.2094281298299845, "grad_norm": 0.7703412771224976, "learning_rate": 3.0216383307573416e-05, "loss": 1.0084, "num_input_tokens_seen": 525056, "step": 1565 }, { "epoch": 1.213292117465224, "grad_norm": 0.719409167766571, "learning_rate": 3.0312982998454408e-05, "loss": 0.8118, "num_input_tokens_seen": 526592, "step": 1570 }, { "epoch": 1.2171561051004636, "grad_norm": 1.2114222049713135, "learning_rate": 3.0409582689335397e-05, "loss": 1.1104, "num_input_tokens_seen": 528288, "step": 1575 }, { "epoch": 1.2210200927357033, "grad_norm": 1.5464136600494385, "learning_rate": 3.0506182380216387e-05, "loss": 0.9531, "num_input_tokens_seen": 529888, "step": 1580 }, { "epoch": 1.2248840803709429, "grad_norm": 0.5869284272193909, "learning_rate": 3.060278207109737e-05, "loss": 0.6893, "num_input_tokens_seen": 531232, "step": 1585 }, { "epoch": 1.2287480680061824, "grad_norm": 0.6353481411933899, "learning_rate": 3.0699381761978365e-05, "loss": 1.0483, "num_input_tokens_seen": 532768, "step": 1590 }, { "epoch": 1.232612055641422, "grad_norm": 1.0008399486541748, "learning_rate": 3.079598145285935e-05, "loss": 0.9461, "num_input_tokens_seen": 534368, "step": 1595 }, { "epoch": 1.2364760432766615, "grad_norm": 1.3135558366775513, "learning_rate": 3.089258114374034e-05, "loss": 0.862, "num_input_tokens_seen": 536096, "step": 1600 }, { "epoch": 1.240340030911901, "grad_norm": 0.7117847204208374, "learning_rate": 3.098918083462133e-05, "loss": 1.0808, "num_input_tokens_seen": 537888, "step": 1605 }, { "epoch": 1.2442040185471406, "grad_norm": 0.5145630836486816, "learning_rate": 3.1085780525502315e-05, "loss": 0.8125, "num_input_tokens_seen": 539712, "step": 1610 }, { "epoch": 1.2480680061823803, "grad_norm": 1.0209296941757202, "learning_rate": 3.118238021638331e-05, "loss": 0.8147, "num_input_tokens_seen": 541408, "step": 1615 }, { "epoch": 1.2519319938176197, "grad_norm": 0.697058916091919, "learning_rate": 3.12789799072643e-05, "loss": 0.9589, "num_input_tokens_seen": 543168, "step": 1620 }, { "epoch": 1.2557959814528594, "grad_norm": 0.8794271945953369, "learning_rate": 3.1375579598145286e-05, "loss": 0.8714, "num_input_tokens_seen": 544800, "step": 1625 }, { "epoch": 1.259659969088099, "grad_norm": 0.7333638668060303, "learning_rate": 3.147217928902628e-05, "loss": 0.7682, "num_input_tokens_seen": 546336, "step": 1630 }, { "epoch": 1.2635239567233385, "grad_norm": 0.8301734924316406, "learning_rate": 3.1568778979907265e-05, "loss": 0.834, "num_input_tokens_seen": 548096, "step": 1635 }, { "epoch": 1.267387944358578, "grad_norm": 1.37346613407135, "learning_rate": 3.166537867078825e-05, "loss": 0.8946, "num_input_tokens_seen": 549856, "step": 1640 }, { "epoch": 1.2712519319938176, "grad_norm": 1.1356102228164673, "learning_rate": 3.176197836166924e-05, "loss": 0.8433, "num_input_tokens_seen": 551616, "step": 1645 }, { "epoch": 1.2751159196290571, "grad_norm": 0.6534898281097412, "learning_rate": 3.185857805255023e-05, "loss": 0.8389, "num_input_tokens_seen": 553312, "step": 1650 }, { "epoch": 1.2789799072642967, "grad_norm": 1.6240729093551636, "learning_rate": 3.195517774343122e-05, "loss": 1.1162, "num_input_tokens_seen": 555072, "step": 1655 }, { "epoch": 1.2828438948995364, "grad_norm": 1.486716389656067, "learning_rate": 3.2051777434312214e-05, "loss": 1.2173, "num_input_tokens_seen": 556800, "step": 1660 }, { "epoch": 1.286707882534776, "grad_norm": 0.7447462677955627, "learning_rate": 3.21483771251932e-05, "loss": 0.7094, "num_input_tokens_seen": 558368, "step": 1665 }, { "epoch": 1.2905718701700155, "grad_norm": 0.7639641165733337, "learning_rate": 3.224497681607419e-05, "loss": 0.9538, "num_input_tokens_seen": 560064, "step": 1670 }, { "epoch": 1.294435857805255, "grad_norm": 0.786544919013977, "learning_rate": 3.234157650695518e-05, "loss": 0.673, "num_input_tokens_seen": 561824, "step": 1675 }, { "epoch": 1.2982998454404946, "grad_norm": 0.7604990005493164, "learning_rate": 3.2438176197836164e-05, "loss": 0.7529, "num_input_tokens_seen": 563552, "step": 1680 }, { "epoch": 1.3021638330757341, "grad_norm": 0.941207230091095, "learning_rate": 3.253477588871716e-05, "loss": 0.9479, "num_input_tokens_seen": 565280, "step": 1685 }, { "epoch": 1.3060278207109737, "grad_norm": 1.6819368600845337, "learning_rate": 3.263137557959814e-05, "loss": 0.8423, "num_input_tokens_seen": 567232, "step": 1690 }, { "epoch": 1.3098918083462132, "grad_norm": 0.701032817363739, "learning_rate": 3.2727975270479135e-05, "loss": 0.8014, "num_input_tokens_seen": 568672, "step": 1695 }, { "epoch": 1.3137557959814528, "grad_norm": 0.9841341972351074, "learning_rate": 3.282457496136013e-05, "loss": 1.2023, "num_input_tokens_seen": 570368, "step": 1700 }, { "epoch": 1.3176197836166925, "grad_norm": 1.4647513628005981, "learning_rate": 3.2921174652241114e-05, "loss": 0.9672, "num_input_tokens_seen": 572128, "step": 1705 }, { "epoch": 1.321483771251932, "grad_norm": 0.7373002171516418, "learning_rate": 3.3017774343122106e-05, "loss": 0.831, "num_input_tokens_seen": 573760, "step": 1710 }, { "epoch": 1.3253477588871716, "grad_norm": 2.065319299697876, "learning_rate": 3.311437403400309e-05, "loss": 0.9866, "num_input_tokens_seen": 575552, "step": 1715 }, { "epoch": 1.3292117465224111, "grad_norm": 0.5286924242973328, "learning_rate": 3.321097372488408e-05, "loss": 0.7247, "num_input_tokens_seen": 577152, "step": 1720 }, { "epoch": 1.3330757341576507, "grad_norm": 0.6796011328697205, "learning_rate": 3.330757341576507e-05, "loss": 0.8574, "num_input_tokens_seen": 579168, "step": 1725 }, { "epoch": 1.3369397217928902, "grad_norm": 0.8097741007804871, "learning_rate": 3.3404173106646057e-05, "loss": 0.7421, "num_input_tokens_seen": 580928, "step": 1730 }, { "epoch": 1.3408037094281298, "grad_norm": 0.7332020401954651, "learning_rate": 3.350077279752705e-05, "loss": 0.7375, "num_input_tokens_seen": 582592, "step": 1735 }, { "epoch": 1.3446676970633695, "grad_norm": 0.8502457141876221, "learning_rate": 3.359737248840804e-05, "loss": 1.0766, "num_input_tokens_seen": 584192, "step": 1740 }, { "epoch": 1.3485316846986088, "grad_norm": 1.515325903892517, "learning_rate": 3.369397217928903e-05, "loss": 0.6962, "num_input_tokens_seen": 585824, "step": 1745 }, { "epoch": 1.3523956723338486, "grad_norm": 0.8190324306488037, "learning_rate": 3.379057187017002e-05, "loss": 0.7111, "num_input_tokens_seen": 587552, "step": 1750 }, { "epoch": 1.3562596599690881, "grad_norm": 0.8595080375671387, "learning_rate": 3.3887171561051006e-05, "loss": 0.7688, "num_input_tokens_seen": 589120, "step": 1755 }, { "epoch": 1.3601236476043277, "grad_norm": 0.9985796809196472, "learning_rate": 3.398377125193199e-05, "loss": 0.8157, "num_input_tokens_seen": 590816, "step": 1760 }, { "epoch": 1.3639876352395672, "grad_norm": 1.0523737668991089, "learning_rate": 3.4080370942812984e-05, "loss": 0.746, "num_input_tokens_seen": 592448, "step": 1765 }, { "epoch": 1.3678516228748068, "grad_norm": 0.7310399413108826, "learning_rate": 3.417697063369397e-05, "loss": 0.847, "num_input_tokens_seen": 594336, "step": 1770 }, { "epoch": 1.3717156105100463, "grad_norm": 0.750669538974762, "learning_rate": 3.427357032457496e-05, "loss": 0.7337, "num_input_tokens_seen": 596064, "step": 1775 }, { "epoch": 1.3755795981452859, "grad_norm": 1.3984320163726807, "learning_rate": 3.4370170015455955e-05, "loss": 0.7882, "num_input_tokens_seen": 598016, "step": 1780 }, { "epoch": 1.3794435857805256, "grad_norm": 0.717572033405304, "learning_rate": 3.446676970633694e-05, "loss": 0.955, "num_input_tokens_seen": 599744, "step": 1785 }, { "epoch": 1.383307573415765, "grad_norm": 0.5958539247512817, "learning_rate": 3.4563369397217934e-05, "loss": 0.9137, "num_input_tokens_seen": 601632, "step": 1790 }, { "epoch": 1.3871715610510047, "grad_norm": 0.8015969395637512, "learning_rate": 3.465996908809892e-05, "loss": 0.7815, "num_input_tokens_seen": 603520, "step": 1795 }, { "epoch": 1.3910355486862442, "grad_norm": 0.8637434244155884, "learning_rate": 3.4756568778979906e-05, "loss": 1.0923, "num_input_tokens_seen": 605120, "step": 1800 }, { "epoch": 1.3948995363214838, "grad_norm": 1.1423274278640747, "learning_rate": 3.48531684698609e-05, "loss": 0.8851, "num_input_tokens_seen": 606720, "step": 1805 }, { "epoch": 1.3987635239567233, "grad_norm": 0.9431897401809692, "learning_rate": 3.4949768160741884e-05, "loss": 0.73, "num_input_tokens_seen": 608576, "step": 1810 }, { "epoch": 1.4026275115919629, "grad_norm": 1.3000550270080566, "learning_rate": 3.504636785162288e-05, "loss": 0.7246, "num_input_tokens_seen": 610464, "step": 1815 }, { "epoch": 1.4064914992272024, "grad_norm": 0.5516097545623779, "learning_rate": 3.514296754250386e-05, "loss": 0.6378, "num_input_tokens_seen": 612128, "step": 1820 }, { "epoch": 1.410355486862442, "grad_norm": 1.2127314805984497, "learning_rate": 3.5239567233384855e-05, "loss": 0.7602, "num_input_tokens_seen": 613824, "step": 1825 }, { "epoch": 1.4142194744976817, "grad_norm": 0.7122685313224792, "learning_rate": 3.533616692426585e-05, "loss": 0.7374, "num_input_tokens_seen": 615808, "step": 1830 }, { "epoch": 1.4180834621329212, "grad_norm": 0.8208901286125183, "learning_rate": 3.5432766615146834e-05, "loss": 0.8918, "num_input_tokens_seen": 617440, "step": 1835 }, { "epoch": 1.4219474497681608, "grad_norm": 0.6221882700920105, "learning_rate": 3.552936630602782e-05, "loss": 0.7805, "num_input_tokens_seen": 619328, "step": 1840 }, { "epoch": 1.4258114374034003, "grad_norm": 0.8033891916275024, "learning_rate": 3.562596599690881e-05, "loss": 0.6901, "num_input_tokens_seen": 620864, "step": 1845 }, { "epoch": 1.4296754250386399, "grad_norm": 0.5942745804786682, "learning_rate": 3.57225656877898e-05, "loss": 0.8982, "num_input_tokens_seen": 622240, "step": 1850 }, { "epoch": 1.4335394126738794, "grad_norm": 0.8124967813491821, "learning_rate": 3.581916537867079e-05, "loss": 0.6388, "num_input_tokens_seen": 624000, "step": 1855 }, { "epoch": 1.437403400309119, "grad_norm": 1.9401965141296387, "learning_rate": 3.5915765069551776e-05, "loss": 0.6339, "num_input_tokens_seen": 625792, "step": 1860 }, { "epoch": 1.4412673879443587, "grad_norm": 0.8966189622879028, "learning_rate": 3.601236476043277e-05, "loss": 0.7679, "num_input_tokens_seen": 627296, "step": 1865 }, { "epoch": 1.445131375579598, "grad_norm": 0.6615520119667053, "learning_rate": 3.610896445131376e-05, "loss": 0.7047, "num_input_tokens_seen": 628800, "step": 1870 }, { "epoch": 1.4489953632148378, "grad_norm": 0.8674830794334412, "learning_rate": 3.620556414219475e-05, "loss": 1.1733, "num_input_tokens_seen": 630528, "step": 1875 }, { "epoch": 1.4528593508500773, "grad_norm": 0.7049899697303772, "learning_rate": 3.630216383307573e-05, "loss": 0.7183, "num_input_tokens_seen": 632224, "step": 1880 }, { "epoch": 1.4567233384853169, "grad_norm": 0.8071690797805786, "learning_rate": 3.6398763523956726e-05, "loss": 0.6257, "num_input_tokens_seen": 633952, "step": 1885 }, { "epoch": 1.4605873261205564, "grad_norm": 1.0074628591537476, "learning_rate": 3.649536321483771e-05, "loss": 1.088, "num_input_tokens_seen": 635488, "step": 1890 }, { "epoch": 1.464451313755796, "grad_norm": 0.5949680805206299, "learning_rate": 3.6591962905718704e-05, "loss": 0.6986, "num_input_tokens_seen": 637152, "step": 1895 }, { "epoch": 1.4683153013910355, "grad_norm": 0.7682070732116699, "learning_rate": 3.668856259659969e-05, "loss": 0.7958, "num_input_tokens_seen": 638752, "step": 1900 }, { "epoch": 1.472179289026275, "grad_norm": 1.2881170511245728, "learning_rate": 3.678516228748068e-05, "loss": 1.0875, "num_input_tokens_seen": 640480, "step": 1905 }, { "epoch": 1.4760432766615148, "grad_norm": 0.8220927715301514, "learning_rate": 3.6881761978361675e-05, "loss": 0.7172, "num_input_tokens_seen": 642272, "step": 1910 }, { "epoch": 1.4799072642967541, "grad_norm": 1.116184949874878, "learning_rate": 3.697836166924266e-05, "loss": 0.9503, "num_input_tokens_seen": 644032, "step": 1915 }, { "epoch": 1.4837712519319939, "grad_norm": 1.182327151298523, "learning_rate": 3.707496136012365e-05, "loss": 0.7848, "num_input_tokens_seen": 645792, "step": 1920 }, { "epoch": 1.4876352395672334, "grad_norm": 0.6956338286399841, "learning_rate": 3.717156105100464e-05, "loss": 0.9004, "num_input_tokens_seen": 647392, "step": 1925 }, { "epoch": 1.491499227202473, "grad_norm": 1.0985716581344604, "learning_rate": 3.7268160741885625e-05, "loss": 0.6651, "num_input_tokens_seen": 648928, "step": 1930 }, { "epoch": 1.4953632148377125, "grad_norm": 0.912192702293396, "learning_rate": 3.736476043276662e-05, "loss": 0.6708, "num_input_tokens_seen": 650880, "step": 1935 }, { "epoch": 1.499227202472952, "grad_norm": 0.781287670135498, "learning_rate": 3.7461360123647604e-05, "loss": 0.7229, "num_input_tokens_seen": 652576, "step": 1940 }, { "epoch": 1.5030911901081918, "grad_norm": 0.6835507750511169, "learning_rate": 3.755795981452859e-05, "loss": 0.5944, "num_input_tokens_seen": 654144, "step": 1945 }, { "epoch": 1.5069551777434311, "grad_norm": 0.8587813377380371, "learning_rate": 3.765455950540959e-05, "loss": 0.8289, "num_input_tokens_seen": 655872, "step": 1950 }, { "epoch": 1.510819165378671, "grad_norm": 0.6796333193778992, "learning_rate": 3.7751159196290575e-05, "loss": 0.7493, "num_input_tokens_seen": 657472, "step": 1955 }, { "epoch": 1.5146831530139102, "grad_norm": 1.3573548793792725, "learning_rate": 3.784775888717156e-05, "loss": 0.8517, "num_input_tokens_seen": 659040, "step": 1960 }, { "epoch": 1.51854714064915, "grad_norm": 0.8044635057449341, "learning_rate": 3.794435857805255e-05, "loss": 0.7258, "num_input_tokens_seen": 660896, "step": 1965 }, { "epoch": 1.5224111282843895, "grad_norm": 0.5785580277442932, "learning_rate": 3.804095826893354e-05, "loss": 0.6346, "num_input_tokens_seen": 662432, "step": 1970 }, { "epoch": 1.526275115919629, "grad_norm": 0.8312123417854309, "learning_rate": 3.813755795981453e-05, "loss": 0.8404, "num_input_tokens_seen": 663968, "step": 1975 }, { "epoch": 1.5301391035548686, "grad_norm": 0.5307157635688782, "learning_rate": 3.823415765069552e-05, "loss": 0.683, "num_input_tokens_seen": 665568, "step": 1980 }, { "epoch": 1.5340030911901081, "grad_norm": 0.6903960108757019, "learning_rate": 3.83307573415765e-05, "loss": 0.7199, "num_input_tokens_seen": 667360, "step": 1985 }, { "epoch": 1.537867078825348, "grad_norm": 0.640998125076294, "learning_rate": 3.84273570324575e-05, "loss": 0.6773, "num_input_tokens_seen": 668864, "step": 1990 }, { "epoch": 1.5417310664605872, "grad_norm": 1.2578482627868652, "learning_rate": 3.852395672333849e-05, "loss": 0.7846, "num_input_tokens_seen": 670976, "step": 1995 }, { "epoch": 1.545595054095827, "grad_norm": 1.0323303937911987, "learning_rate": 3.8620556414219474e-05, "loss": 0.7515, "num_input_tokens_seen": 672576, "step": 2000 }, { "epoch": 1.5494590417310663, "grad_norm": 1.31444251537323, "learning_rate": 3.871715610510047e-05, "loss": 1.0477, "num_input_tokens_seen": 674208, "step": 2005 }, { "epoch": 1.553323029366306, "grad_norm": 0.5604597330093384, "learning_rate": 3.881375579598145e-05, "loss": 0.5952, "num_input_tokens_seen": 675808, "step": 2010 }, { "epoch": 1.5571870170015456, "grad_norm": 0.7323945760726929, "learning_rate": 3.8910355486862445e-05, "loss": 0.5813, "num_input_tokens_seen": 677344, "step": 2015 }, { "epoch": 1.5610510046367851, "grad_norm": 0.9297330379486084, "learning_rate": 3.900695517774343e-05, "loss": 0.5838, "num_input_tokens_seen": 679296, "step": 2020 }, { "epoch": 1.5649149922720247, "grad_norm": 0.6750853657722473, "learning_rate": 3.910355486862442e-05, "loss": 0.7568, "num_input_tokens_seen": 681120, "step": 2025 }, { "epoch": 1.5687789799072642, "grad_norm": 2.394862174987793, "learning_rate": 3.920015455950541e-05, "loss": 0.6102, "num_input_tokens_seen": 682720, "step": 2030 }, { "epoch": 1.572642967542504, "grad_norm": 0.7384635210037231, "learning_rate": 3.92967542503864e-05, "loss": 0.5863, "num_input_tokens_seen": 684352, "step": 2035 }, { "epoch": 1.5765069551777433, "grad_norm": 1.0256986618041992, "learning_rate": 3.939335394126739e-05, "loss": 0.8352, "num_input_tokens_seen": 686208, "step": 2040 }, { "epoch": 1.580370942812983, "grad_norm": 0.8528231382369995, "learning_rate": 3.948995363214838e-05, "loss": 1.0941, "num_input_tokens_seen": 687680, "step": 2045 }, { "epoch": 1.5842349304482226, "grad_norm": 1.1752705574035645, "learning_rate": 3.9586553323029367e-05, "loss": 0.8446, "num_input_tokens_seen": 689440, "step": 2050 }, { "epoch": 1.5880989180834622, "grad_norm": 0.8569584488868713, "learning_rate": 3.968315301391036e-05, "loss": 0.6217, "num_input_tokens_seen": 691168, "step": 2055 }, { "epoch": 1.5919629057187017, "grad_norm": 0.7248162031173706, "learning_rate": 3.9779752704791345e-05, "loss": 0.6564, "num_input_tokens_seen": 692992, "step": 2060 }, { "epoch": 1.5958268933539412, "grad_norm": 1.247253179550171, "learning_rate": 3.987635239567233e-05, "loss": 0.704, "num_input_tokens_seen": 694560, "step": 2065 }, { "epoch": 1.599690880989181, "grad_norm": 0.5590627193450928, "learning_rate": 3.9972952086553323e-05, "loss": 0.5996, "num_input_tokens_seen": 696320, "step": 2070 }, { "epoch": 1.6035548686244203, "grad_norm": 0.9131113886833191, "learning_rate": 4.0069551777434316e-05, "loss": 0.6965, "num_input_tokens_seen": 697984, "step": 2075 }, { "epoch": 1.60741885625966, "grad_norm": 1.3589409589767456, "learning_rate": 4.01661514683153e-05, "loss": 0.8348, "num_input_tokens_seen": 699648, "step": 2080 }, { "epoch": 1.6112828438948994, "grad_norm": 0.5208396315574646, "learning_rate": 4.0262751159196294e-05, "loss": 0.6078, "num_input_tokens_seen": 701408, "step": 2085 }, { "epoch": 1.6151468315301392, "grad_norm": 0.9699952602386475, "learning_rate": 4.035935085007728e-05, "loss": 0.6212, "num_input_tokens_seen": 703136, "step": 2090 }, { "epoch": 1.6190108191653787, "grad_norm": 0.5439551472663879, "learning_rate": 4.045595054095827e-05, "loss": 0.6007, "num_input_tokens_seen": 704640, "step": 2095 }, { "epoch": 1.6228748068006182, "grad_norm": 0.7687732577323914, "learning_rate": 4.055255023183926e-05, "loss": 0.8047, "num_input_tokens_seen": 706432, "step": 2100 }, { "epoch": 1.6267387944358578, "grad_norm": 0.7396436333656311, "learning_rate": 4.0649149922720245e-05, "loss": 0.5919, "num_input_tokens_seen": 708064, "step": 2105 }, { "epoch": 1.6306027820710973, "grad_norm": 0.8226101994514465, "learning_rate": 4.074574961360124e-05, "loss": 0.6833, "num_input_tokens_seen": 709856, "step": 2110 }, { "epoch": 1.634466769706337, "grad_norm": 1.176469326019287, "learning_rate": 4.084234930448223e-05, "loss": 1.0608, "num_input_tokens_seen": 711552, "step": 2115 }, { "epoch": 1.6383307573415764, "grad_norm": 1.0629485845565796, "learning_rate": 4.0938948995363216e-05, "loss": 0.6702, "num_input_tokens_seen": 713216, "step": 2120 }, { "epoch": 1.6421947449768162, "grad_norm": 0.6896587014198303, "learning_rate": 4.103554868624421e-05, "loss": 0.7691, "num_input_tokens_seen": 714848, "step": 2125 }, { "epoch": 1.6460587326120555, "grad_norm": 0.7454641461372375, "learning_rate": 4.1132148377125194e-05, "loss": 0.7608, "num_input_tokens_seen": 716320, "step": 2130 }, { "epoch": 1.6499227202472952, "grad_norm": 1.0238568782806396, "learning_rate": 4.122874806800619e-05, "loss": 0.6493, "num_input_tokens_seen": 717952, "step": 2135 }, { "epoch": 1.6537867078825348, "grad_norm": 0.7152611017227173, "learning_rate": 4.132534775888717e-05, "loss": 0.5893, "num_input_tokens_seen": 719552, "step": 2140 }, { "epoch": 1.6576506955177743, "grad_norm": 1.6242997646331787, "learning_rate": 4.142194744976816e-05, "loss": 0.8498, "num_input_tokens_seen": 721248, "step": 2145 }, { "epoch": 1.6615146831530139, "grad_norm": 0.9102374315261841, "learning_rate": 4.151854714064915e-05, "loss": 0.6412, "num_input_tokens_seen": 723168, "step": 2150 }, { "epoch": 1.6653786707882534, "grad_norm": 0.8263702988624573, "learning_rate": 4.161514683153014e-05, "loss": 0.501, "num_input_tokens_seen": 724928, "step": 2155 }, { "epoch": 1.6692426584234932, "grad_norm": 1.2555872201919556, "learning_rate": 4.171174652241113e-05, "loss": 0.8656, "num_input_tokens_seen": 726560, "step": 2160 }, { "epoch": 1.6731066460587325, "grad_norm": 0.7003481984138489, "learning_rate": 4.180834621329212e-05, "loss": 0.6713, "num_input_tokens_seen": 728416, "step": 2165 }, { "epoch": 1.6769706336939723, "grad_norm": 0.681065559387207, "learning_rate": 4.190494590417311e-05, "loss": 0.8969, "num_input_tokens_seen": 729952, "step": 2170 }, { "epoch": 1.6808346213292118, "grad_norm": 0.6072214245796204, "learning_rate": 4.20015455950541e-05, "loss": 0.562, "num_input_tokens_seen": 731616, "step": 2175 }, { "epoch": 1.6846986089644513, "grad_norm": 0.9297857880592346, "learning_rate": 4.2098145285935086e-05, "loss": 0.7179, "num_input_tokens_seen": 733344, "step": 2180 }, { "epoch": 1.6885625965996909, "grad_norm": 0.501737654209137, "learning_rate": 4.219474497681607e-05, "loss": 0.6209, "num_input_tokens_seen": 735008, "step": 2185 }, { "epoch": 1.6924265842349304, "grad_norm": 0.6410394906997681, "learning_rate": 4.2291344667697065e-05, "loss": 0.4916, "num_input_tokens_seen": 736576, "step": 2190 }, { "epoch": 1.69629057187017, "grad_norm": 0.47106167674064636, "learning_rate": 4.238794435857805e-05, "loss": 0.6198, "num_input_tokens_seen": 738496, "step": 2195 }, { "epoch": 1.7001545595054095, "grad_norm": 0.8997061848640442, "learning_rate": 4.248454404945904e-05, "loss": 0.6574, "num_input_tokens_seen": 740032, "step": 2200 }, { "epoch": 1.7040185471406493, "grad_norm": 1.4319831132888794, "learning_rate": 4.2581143740340036e-05, "loss": 0.7729, "num_input_tokens_seen": 741536, "step": 2205 }, { "epoch": 1.7078825347758886, "grad_norm": 1.0114225149154663, "learning_rate": 4.267774343122102e-05, "loss": 0.6942, "num_input_tokens_seen": 743424, "step": 2210 }, { "epoch": 1.7117465224111283, "grad_norm": 0.7815228700637817, "learning_rate": 4.2774343122102014e-05, "loss": 0.7835, "num_input_tokens_seen": 745248, "step": 2215 }, { "epoch": 1.7156105100463679, "grad_norm": 0.6217542886734009, "learning_rate": 4.2870942812983e-05, "loss": 0.7046, "num_input_tokens_seen": 747008, "step": 2220 }, { "epoch": 1.7194744976816074, "grad_norm": 1.622069001197815, "learning_rate": 4.2967542503863986e-05, "loss": 0.7176, "num_input_tokens_seen": 748640, "step": 2225 }, { "epoch": 1.723338485316847, "grad_norm": 0.9915990829467773, "learning_rate": 4.306414219474498e-05, "loss": 0.6224, "num_input_tokens_seen": 750656, "step": 2230 }, { "epoch": 1.7272024729520865, "grad_norm": 0.5827153325080872, "learning_rate": 4.3160741885625964e-05, "loss": 0.789, "num_input_tokens_seen": 752096, "step": 2235 }, { "epoch": 1.7310664605873263, "grad_norm": 0.6093546152114868, "learning_rate": 4.325734157650696e-05, "loss": 0.7906, "num_input_tokens_seen": 753824, "step": 2240 }, { "epoch": 1.7349304482225656, "grad_norm": 0.6442155241966248, "learning_rate": 4.335394126738795e-05, "loss": 0.5503, "num_input_tokens_seen": 755680, "step": 2245 }, { "epoch": 1.7387944358578054, "grad_norm": 0.712822675704956, "learning_rate": 4.3450540958268935e-05, "loss": 0.5338, "num_input_tokens_seen": 757280, "step": 2250 }, { "epoch": 1.7426584234930447, "grad_norm": 0.8253329396247864, "learning_rate": 4.354714064914993e-05, "loss": 0.7345, "num_input_tokens_seen": 759040, "step": 2255 }, { "epoch": 1.7465224111282844, "grad_norm": 0.7335242629051208, "learning_rate": 4.3643740340030914e-05, "loss": 0.5652, "num_input_tokens_seen": 760352, "step": 2260 }, { "epoch": 1.750386398763524, "grad_norm": 0.5596067309379578, "learning_rate": 4.37403400309119e-05, "loss": 0.8214, "num_input_tokens_seen": 762080, "step": 2265 }, { "epoch": 1.7542503863987635, "grad_norm": 0.5894043445587158, "learning_rate": 4.383693972179289e-05, "loss": 0.632, "num_input_tokens_seen": 763840, "step": 2270 }, { "epoch": 1.758114374034003, "grad_norm": 1.013756275177002, "learning_rate": 4.393353941267388e-05, "loss": 0.7192, "num_input_tokens_seen": 765536, "step": 2275 }, { "epoch": 1.7619783616692426, "grad_norm": 0.6711904406547546, "learning_rate": 4.403013910355487e-05, "loss": 0.5283, "num_input_tokens_seen": 767104, "step": 2280 }, { "epoch": 1.7658423493044824, "grad_norm": 0.4724126160144806, "learning_rate": 4.412673879443586e-05, "loss": 0.6135, "num_input_tokens_seen": 768576, "step": 2285 }, { "epoch": 1.7697063369397217, "grad_norm": 1.1004294157028198, "learning_rate": 4.422333848531685e-05, "loss": 0.6056, "num_input_tokens_seen": 770048, "step": 2290 }, { "epoch": 1.7735703245749614, "grad_norm": 0.7312191724777222, "learning_rate": 4.431993817619784e-05, "loss": 0.6949, "num_input_tokens_seen": 771680, "step": 2295 }, { "epoch": 1.7774343122102008, "grad_norm": 0.4875451624393463, "learning_rate": 4.441653786707883e-05, "loss": 0.6315, "num_input_tokens_seen": 773312, "step": 2300 }, { "epoch": 1.7812982998454405, "grad_norm": 0.8489568829536438, "learning_rate": 4.451313755795981e-05, "loss": 0.8622, "num_input_tokens_seen": 775040, "step": 2305 }, { "epoch": 1.78516228748068, "grad_norm": 0.6089122891426086, "learning_rate": 4.4609737248840806e-05, "loss": 0.5423, "num_input_tokens_seen": 776768, "step": 2310 }, { "epoch": 1.7890262751159196, "grad_norm": 0.6091098189353943, "learning_rate": 4.470633693972179e-05, "loss": 0.7683, "num_input_tokens_seen": 778336, "step": 2315 }, { "epoch": 1.7928902627511591, "grad_norm": 0.6857431530952454, "learning_rate": 4.4802936630602784e-05, "loss": 0.5888, "num_input_tokens_seen": 780096, "step": 2320 }, { "epoch": 1.7967542503863987, "grad_norm": 0.7699395418167114, "learning_rate": 4.489953632148378e-05, "loss": 0.5522, "num_input_tokens_seen": 781728, "step": 2325 }, { "epoch": 1.8006182380216385, "grad_norm": 0.977664589881897, "learning_rate": 4.499613601236476e-05, "loss": 0.8194, "num_input_tokens_seen": 783168, "step": 2330 }, { "epoch": 1.8044822256568778, "grad_norm": 0.6420139670372009, "learning_rate": 4.5092735703245755e-05, "loss": 0.5316, "num_input_tokens_seen": 785056, "step": 2335 }, { "epoch": 1.8083462132921175, "grad_norm": 1.3396207094192505, "learning_rate": 4.518933539412674e-05, "loss": 0.9368, "num_input_tokens_seen": 786688, "step": 2340 }, { "epoch": 1.812210200927357, "grad_norm": 0.5982866883277893, "learning_rate": 4.528593508500773e-05, "loss": 0.5813, "num_input_tokens_seen": 788288, "step": 2345 }, { "epoch": 1.8160741885625966, "grad_norm": 0.6208470463752747, "learning_rate": 4.538253477588872e-05, "loss": 0.5622, "num_input_tokens_seen": 789952, "step": 2350 }, { "epoch": 1.8199381761978362, "grad_norm": 0.652672290802002, "learning_rate": 4.5479134466769706e-05, "loss": 0.8274, "num_input_tokens_seen": 791712, "step": 2355 }, { "epoch": 1.8238021638330757, "grad_norm": 1.1196138858795166, "learning_rate": 4.55757341576507e-05, "loss": 0.7782, "num_input_tokens_seen": 793344, "step": 2360 }, { "epoch": 1.8276661514683155, "grad_norm": 0.6701226830482483, "learning_rate": 4.5672333848531684e-05, "loss": 0.544, "num_input_tokens_seen": 795040, "step": 2365 }, { "epoch": 1.8315301391035548, "grad_norm": 1.6125117540359497, "learning_rate": 4.576893353941268e-05, "loss": 0.741, "num_input_tokens_seen": 796768, "step": 2370 }, { "epoch": 1.8353941267387945, "grad_norm": 1.0787913799285889, "learning_rate": 4.586553323029367e-05, "loss": 0.9342, "num_input_tokens_seen": 798528, "step": 2375 }, { "epoch": 1.8392581143740339, "grad_norm": 0.6988455653190613, "learning_rate": 4.5962132921174655e-05, "loss": 0.519, "num_input_tokens_seen": 800256, "step": 2380 }, { "epoch": 1.8431221020092736, "grad_norm": 1.3168835639953613, "learning_rate": 4.605873261205564e-05, "loss": 0.9378, "num_input_tokens_seen": 802016, "step": 2385 }, { "epoch": 1.8469860896445132, "grad_norm": 0.5075323581695557, "learning_rate": 4.6155332302936633e-05, "loss": 0.499, "num_input_tokens_seen": 803680, "step": 2390 }, { "epoch": 1.8508500772797527, "grad_norm": 0.9275404214859009, "learning_rate": 4.625193199381762e-05, "loss": 0.5356, "num_input_tokens_seen": 805312, "step": 2395 }, { "epoch": 1.8547140649149922, "grad_norm": 0.9113990068435669, "learning_rate": 4.634853168469861e-05, "loss": 0.6181, "num_input_tokens_seen": 806976, "step": 2400 }, { "epoch": 1.8585780525502318, "grad_norm": 0.7387285232543945, "learning_rate": 4.64451313755796e-05, "loss": 0.5397, "num_input_tokens_seen": 808288, "step": 2405 }, { "epoch": 1.8624420401854715, "grad_norm": 0.6509053111076355, "learning_rate": 4.654173106646059e-05, "loss": 0.811, "num_input_tokens_seen": 810016, "step": 2410 }, { "epoch": 1.8663060278207109, "grad_norm": 0.510462760925293, "learning_rate": 4.663833075734158e-05, "loss": 0.6806, "num_input_tokens_seen": 811648, "step": 2415 }, { "epoch": 1.8701700154559506, "grad_norm": 0.41505053639411926, "learning_rate": 4.673493044822257e-05, "loss": 0.495, "num_input_tokens_seen": 813504, "step": 2420 }, { "epoch": 1.87403400309119, "grad_norm": 0.719312846660614, "learning_rate": 4.6831530139103555e-05, "loss": 0.7905, "num_input_tokens_seen": 815136, "step": 2425 }, { "epoch": 1.8778979907264297, "grad_norm": 1.2930697202682495, "learning_rate": 4.692812982998455e-05, "loss": 0.8493, "num_input_tokens_seen": 816736, "step": 2430 }, { "epoch": 1.8817619783616693, "grad_norm": 0.46564576029777527, "learning_rate": 4.702472952086553e-05, "loss": 0.5384, "num_input_tokens_seen": 818336, "step": 2435 }, { "epoch": 1.8856259659969088, "grad_norm": 0.6727601885795593, "learning_rate": 4.7121329211746526e-05, "loss": 0.5985, "num_input_tokens_seen": 820000, "step": 2440 }, { "epoch": 1.8894899536321483, "grad_norm": 0.719415545463562, "learning_rate": 4.721792890262751e-05, "loss": 0.5334, "num_input_tokens_seen": 821504, "step": 2445 }, { "epoch": 1.8933539412673879, "grad_norm": 0.5879726409912109, "learning_rate": 4.73145285935085e-05, "loss": 0.5197, "num_input_tokens_seen": 823104, "step": 2450 }, { "epoch": 1.8972179289026276, "grad_norm": 0.9655633568763733, "learning_rate": 4.74111282843895e-05, "loss": 0.7266, "num_input_tokens_seen": 824480, "step": 2455 }, { "epoch": 1.901081916537867, "grad_norm": 0.8651092052459717, "learning_rate": 4.750772797527048e-05, "loss": 0.881, "num_input_tokens_seen": 826336, "step": 2460 }, { "epoch": 1.9049459041731067, "grad_norm": 1.4631222486495972, "learning_rate": 4.760432766615147e-05, "loss": 0.8207, "num_input_tokens_seen": 828032, "step": 2465 }, { "epoch": 1.9088098918083463, "grad_norm": 0.7509258985519409, "learning_rate": 4.770092735703246e-05, "loss": 0.609, "num_input_tokens_seen": 829728, "step": 2470 }, { "epoch": 1.9126738794435858, "grad_norm": 0.7540879845619202, "learning_rate": 4.779752704791345e-05, "loss": 0.5762, "num_input_tokens_seen": 831648, "step": 2475 }, { "epoch": 1.9165378670788253, "grad_norm": 0.47950440645217896, "learning_rate": 4.789412673879444e-05, "loss": 0.5746, "num_input_tokens_seen": 833504, "step": 2480 }, { "epoch": 1.9204018547140649, "grad_norm": 0.42816653847694397, "learning_rate": 4.7990726429675425e-05, "loss": 0.8795, "num_input_tokens_seen": 835424, "step": 2485 }, { "epoch": 1.9242658423493046, "grad_norm": 0.7701159119606018, "learning_rate": 4.808732612055641e-05, "loss": 0.7916, "num_input_tokens_seen": 837024, "step": 2490 }, { "epoch": 1.928129829984544, "grad_norm": 0.6747866868972778, "learning_rate": 4.818392581143741e-05, "loss": 0.6403, "num_input_tokens_seen": 839200, "step": 2495 }, { "epoch": 1.9319938176197837, "grad_norm": 1.2165838479995728, "learning_rate": 4.8280525502318396e-05, "loss": 0.468, "num_input_tokens_seen": 840704, "step": 2500 }, { "epoch": 1.935857805255023, "grad_norm": 0.4264359176158905, "learning_rate": 4.837712519319938e-05, "loss": 0.5381, "num_input_tokens_seen": 842560, "step": 2505 }, { "epoch": 1.9397217928902628, "grad_norm": 1.1095354557037354, "learning_rate": 4.8473724884080375e-05, "loss": 0.5742, "num_input_tokens_seen": 844384, "step": 2510 }, { "epoch": 1.9435857805255023, "grad_norm": 1.8850528001785278, "learning_rate": 4.857032457496136e-05, "loss": 0.7927, "num_input_tokens_seen": 846048, "step": 2515 }, { "epoch": 1.947449768160742, "grad_norm": 1.707824468612671, "learning_rate": 4.866692426584235e-05, "loss": 0.8316, "num_input_tokens_seen": 847680, "step": 2520 }, { "epoch": 1.9513137557959814, "grad_norm": 0.6285451054573059, "learning_rate": 4.876352395672334e-05, "loss": 0.5561, "num_input_tokens_seen": 849184, "step": 2525 }, { "epoch": 1.955177743431221, "grad_norm": 0.7713500261306763, "learning_rate": 4.8860123647604325e-05, "loss": 0.5082, "num_input_tokens_seen": 850560, "step": 2530 }, { "epoch": 1.9590417310664607, "grad_norm": 0.4966448247432709, "learning_rate": 4.8956723338485324e-05, "loss": 0.5068, "num_input_tokens_seen": 852096, "step": 2535 }, { "epoch": 1.9629057187017, "grad_norm": 1.0377403497695923, "learning_rate": 4.905332302936631e-05, "loss": 0.6178, "num_input_tokens_seen": 853568, "step": 2540 }, { "epoch": 1.9667697063369398, "grad_norm": 0.7405827641487122, "learning_rate": 4.9149922720247296e-05, "loss": 0.5472, "num_input_tokens_seen": 855104, "step": 2545 }, { "epoch": 1.9706336939721791, "grad_norm": 0.7624149918556213, "learning_rate": 4.924652241112829e-05, "loss": 0.6679, "num_input_tokens_seen": 856800, "step": 2550 }, { "epoch": 1.974497681607419, "grad_norm": 0.8275721073150635, "learning_rate": 4.9343122102009274e-05, "loss": 0.5765, "num_input_tokens_seen": 858528, "step": 2555 }, { "epoch": 1.9783616692426584, "grad_norm": 0.9622222781181335, "learning_rate": 4.943972179289027e-05, "loss": 0.5457, "num_input_tokens_seen": 860128, "step": 2560 }, { "epoch": 1.982225656877898, "grad_norm": 0.5277184247970581, "learning_rate": 4.953632148377125e-05, "loss": 0.7903, "num_input_tokens_seen": 861888, "step": 2565 }, { "epoch": 1.9860896445131375, "grad_norm": 0.9154358506202698, "learning_rate": 4.963292117465224e-05, "loss": 0.5184, "num_input_tokens_seen": 863360, "step": 2570 }, { "epoch": 1.989953632148377, "grad_norm": 1.063477873802185, "learning_rate": 4.972952086553323e-05, "loss": 0.6492, "num_input_tokens_seen": 864832, "step": 2575 }, { "epoch": 1.9938176197836168, "grad_norm": 0.8353647589683533, "learning_rate": 4.9826120556414224e-05, "loss": 0.6144, "num_input_tokens_seen": 866688, "step": 2580 }, { "epoch": 1.9976816074188561, "grad_norm": 0.6776650547981262, "learning_rate": 4.992272024729521e-05, "loss": 0.5139, "num_input_tokens_seen": 868384, "step": 2585 }, { "epoch": 2.0, "eval_loss": 0.6458402872085571, "eval_runtime": 6.3928, "eval_samples_per_second": 89.945, "eval_steps_per_second": 22.525, "num_input_tokens_seen": 869056, "step": 2588 }, { "epoch": 2.001545595054096, "grad_norm": 0.7567434310913086, "learning_rate": 4.9999999772597e-05, "loss": 0.5526, "num_input_tokens_seen": 869792, "step": 2590 }, { "epoch": 2.0054095826893352, "grad_norm": 0.7644327878952026, "learning_rate": 4.9999991813492344e-05, "loss": 0.5636, "num_input_tokens_seen": 871360, "step": 2595 }, { "epoch": 2.009273570324575, "grad_norm": 1.3297168016433716, "learning_rate": 4.999997248424169e-05, "loss": 0.5769, "num_input_tokens_seen": 873152, "step": 2600 }, { "epoch": 2.0131375579598147, "grad_norm": 0.839655876159668, "learning_rate": 4.9999941784853825e-05, "loss": 0.8806, "num_input_tokens_seen": 875008, "step": 2605 }, { "epoch": 2.017001545595054, "grad_norm": 0.4345882534980774, "learning_rate": 4.999989971534272e-05, "loss": 0.5481, "num_input_tokens_seen": 876512, "step": 2610 }, { "epoch": 2.020865533230294, "grad_norm": 0.6954939961433411, "learning_rate": 4.9999846275727515e-05, "loss": 0.6405, "num_input_tokens_seen": 878496, "step": 2615 }, { "epoch": 2.024729520865533, "grad_norm": 1.1561851501464844, "learning_rate": 4.99997814660325e-05, "loss": 0.4887, "num_input_tokens_seen": 880288, "step": 2620 }, { "epoch": 2.028593508500773, "grad_norm": 0.4096739590167999, "learning_rate": 4.999970528628716e-05, "loss": 0.7837, "num_input_tokens_seen": 882016, "step": 2625 }, { "epoch": 2.0324574961360122, "grad_norm": 0.5120589733123779, "learning_rate": 4.999961773652613e-05, "loss": 0.618, "num_input_tokens_seen": 883744, "step": 2630 }, { "epoch": 2.036321483771252, "grad_norm": 0.6878524422645569, "learning_rate": 4.999951881678924e-05, "loss": 0.5735, "num_input_tokens_seen": 885344, "step": 2635 }, { "epoch": 2.0401854714064913, "grad_norm": 1.59120512008667, "learning_rate": 4.9999408527121474e-05, "loss": 0.739, "num_input_tokens_seen": 886816, "step": 2640 }, { "epoch": 2.044049459041731, "grad_norm": 0.4762903153896332, "learning_rate": 4.9999286867573004e-05, "loss": 0.4966, "num_input_tokens_seen": 888448, "step": 2645 }, { "epoch": 2.047913446676971, "grad_norm": 0.4936424493789673, "learning_rate": 4.9999153838199144e-05, "loss": 0.5886, "num_input_tokens_seen": 889984, "step": 2650 }, { "epoch": 2.05177743431221, "grad_norm": 0.7660191655158997, "learning_rate": 4.999900943906041e-05, "loss": 0.9588, "num_input_tokens_seen": 891680, "step": 2655 }, { "epoch": 2.05564142194745, "grad_norm": 1.32345712184906, "learning_rate": 4.9998853670222454e-05, "loss": 0.7879, "num_input_tokens_seen": 893312, "step": 2660 }, { "epoch": 2.0595054095826892, "grad_norm": 1.05155611038208, "learning_rate": 4.999868653175616e-05, "loss": 0.9682, "num_input_tokens_seen": 895040, "step": 2665 }, { "epoch": 2.063369397217929, "grad_norm": 0.6883065700531006, "learning_rate": 4.99985080237375e-05, "loss": 0.6255, "num_input_tokens_seen": 896768, "step": 2670 }, { "epoch": 2.0672333848531683, "grad_norm": 0.9660115242004395, "learning_rate": 4.9998318146247694e-05, "loss": 0.5339, "num_input_tokens_seen": 898400, "step": 2675 }, { "epoch": 2.071097372488408, "grad_norm": 1.0439225435256958, "learning_rate": 4.9998116899373073e-05, "loss": 0.7956, "num_input_tokens_seen": 899744, "step": 2680 }, { "epoch": 2.0749613601236474, "grad_norm": 0.8421943783760071, "learning_rate": 4.999790428320519e-05, "loss": 0.5621, "num_input_tokens_seen": 901376, "step": 2685 }, { "epoch": 2.078825347758887, "grad_norm": 0.9894040822982788, "learning_rate": 4.9997680297840734e-05, "loss": 0.8057, "num_input_tokens_seen": 902880, "step": 2690 }, { "epoch": 2.082689335394127, "grad_norm": 0.6950976252555847, "learning_rate": 4.9997444943381566e-05, "loss": 0.6336, "num_input_tokens_seen": 904544, "step": 2695 }, { "epoch": 2.0865533230293662, "grad_norm": 0.45721614360809326, "learning_rate": 4.999719821993473e-05, "loss": 0.6682, "num_input_tokens_seen": 906496, "step": 2700 }, { "epoch": 2.090417310664606, "grad_norm": 0.6862127780914307, "learning_rate": 4.9996940127612444e-05, "loss": 0.4762, "num_input_tokens_seen": 908000, "step": 2705 }, { "epoch": 2.0942812982998453, "grad_norm": 0.46799594163894653, "learning_rate": 4.9996670666532096e-05, "loss": 0.4524, "num_input_tokens_seen": 909600, "step": 2710 }, { "epoch": 2.098145285935085, "grad_norm": 0.39387014508247375, "learning_rate": 4.999638983681622e-05, "loss": 0.6352, "num_input_tokens_seen": 911296, "step": 2715 }, { "epoch": 2.1020092735703244, "grad_norm": 0.45652860403060913, "learning_rate": 4.999609763859255e-05, "loss": 0.6564, "num_input_tokens_seen": 913120, "step": 2720 }, { "epoch": 2.105873261205564, "grad_norm": 0.7216565012931824, "learning_rate": 4.999579407199398e-05, "loss": 0.7351, "num_input_tokens_seen": 914752, "step": 2725 }, { "epoch": 2.109737248840804, "grad_norm": 0.722620964050293, "learning_rate": 4.9995479137158577e-05, "loss": 0.4779, "num_input_tokens_seen": 916224, "step": 2730 }, { "epoch": 2.1136012364760433, "grad_norm": 0.8284867405891418, "learning_rate": 4.9995152834229564e-05, "loss": 0.8665, "num_input_tokens_seen": 917984, "step": 2735 }, { "epoch": 2.117465224111283, "grad_norm": 0.8774116039276123, "learning_rate": 4.999481516335536e-05, "loss": 0.407, "num_input_tokens_seen": 919520, "step": 2740 }, { "epoch": 2.1213292117465223, "grad_norm": 1.0714284181594849, "learning_rate": 4.999446612468952e-05, "loss": 0.5898, "num_input_tokens_seen": 921184, "step": 2745 }, { "epoch": 2.125193199381762, "grad_norm": 0.621341347694397, "learning_rate": 4.9994105718390804e-05, "loss": 0.5437, "num_input_tokens_seen": 922688, "step": 2750 }, { "epoch": 2.1290571870170014, "grad_norm": 1.2152304649353027, "learning_rate": 4.9993733944623136e-05, "loss": 0.7612, "num_input_tokens_seen": 924640, "step": 2755 }, { "epoch": 2.132921174652241, "grad_norm": 0.45624759793281555, "learning_rate": 4.999335080355557e-05, "loss": 0.6733, "num_input_tokens_seen": 926560, "step": 2760 }, { "epoch": 2.1367851622874805, "grad_norm": 0.7852809429168701, "learning_rate": 4.9992956295362395e-05, "loss": 0.6504, "num_input_tokens_seen": 928064, "step": 2765 }, { "epoch": 2.1406491499227203, "grad_norm": 0.6402932405471802, "learning_rate": 4.9992550420223e-05, "loss": 0.6883, "num_input_tokens_seen": 929760, "step": 2770 }, { "epoch": 2.1445131375579596, "grad_norm": 0.9694505333900452, "learning_rate": 4.999213317832202e-05, "loss": 0.6711, "num_input_tokens_seen": 931552, "step": 2775 }, { "epoch": 2.1483771251931993, "grad_norm": 0.8154115676879883, "learning_rate": 4.999170456984918e-05, "loss": 0.5121, "num_input_tokens_seen": 933344, "step": 2780 }, { "epoch": 2.152241112828439, "grad_norm": 0.8052494525909424, "learning_rate": 4.999126459499945e-05, "loss": 0.7092, "num_input_tokens_seen": 934912, "step": 2785 }, { "epoch": 2.1561051004636784, "grad_norm": 0.9739042520523071, "learning_rate": 4.999081325397291e-05, "loss": 0.6309, "num_input_tokens_seen": 936512, "step": 2790 }, { "epoch": 2.159969088098918, "grad_norm": 0.8484548330307007, "learning_rate": 4.999035054697483e-05, "loss": 0.5199, "num_input_tokens_seen": 938112, "step": 2795 }, { "epoch": 2.1638330757341575, "grad_norm": 0.708020031452179, "learning_rate": 4.9989876474215666e-05, "loss": 0.5703, "num_input_tokens_seen": 940064, "step": 2800 }, { "epoch": 2.1676970633693973, "grad_norm": 0.4589965343475342, "learning_rate": 4.998939103591103e-05, "loss": 0.5707, "num_input_tokens_seen": 941920, "step": 2805 }, { "epoch": 2.1715610510046366, "grad_norm": 0.6860437393188477, "learning_rate": 4.998889423228168e-05, "loss": 0.6714, "num_input_tokens_seen": 943840, "step": 2810 }, { "epoch": 2.1754250386398764, "grad_norm": 0.6176663637161255, "learning_rate": 4.998838606355359e-05, "loss": 0.5027, "num_input_tokens_seen": 945312, "step": 2815 }, { "epoch": 2.179289026275116, "grad_norm": 0.8756307363510132, "learning_rate": 4.998786652995787e-05, "loss": 0.5638, "num_input_tokens_seen": 946912, "step": 2820 }, { "epoch": 2.1831530139103554, "grad_norm": 0.6318624019622803, "learning_rate": 4.99873356317308e-05, "loss": 0.4631, "num_input_tokens_seen": 948352, "step": 2825 }, { "epoch": 2.187017001545595, "grad_norm": 0.7667078971862793, "learning_rate": 4.9986793369113846e-05, "loss": 0.4205, "num_input_tokens_seen": 949792, "step": 2830 }, { "epoch": 2.1908809891808345, "grad_norm": 1.2432109117507935, "learning_rate": 4.9986239742353627e-05, "loss": 0.8576, "num_input_tokens_seen": 951584, "step": 2835 }, { "epoch": 2.1947449768160743, "grad_norm": 0.5868632793426514, "learning_rate": 4.998567475170193e-05, "loss": 0.5463, "num_input_tokens_seen": 953248, "step": 2840 }, { "epoch": 2.1986089644513136, "grad_norm": 0.5597259402275085, "learning_rate": 4.998509839741573e-05, "loss": 0.463, "num_input_tokens_seen": 954944, "step": 2845 }, { "epoch": 2.2024729520865534, "grad_norm": 0.796594500541687, "learning_rate": 4.998451067975714e-05, "loss": 0.4432, "num_input_tokens_seen": 956480, "step": 2850 }, { "epoch": 2.206336939721793, "grad_norm": 0.7900167107582092, "learning_rate": 4.998391159899348e-05, "loss": 0.4661, "num_input_tokens_seen": 957952, "step": 2855 }, { "epoch": 2.2102009273570324, "grad_norm": 0.5220023989677429, "learning_rate": 4.9983301155397195e-05, "loss": 0.8132, "num_input_tokens_seen": 959744, "step": 2860 }, { "epoch": 2.214064914992272, "grad_norm": 0.46018967032432556, "learning_rate": 4.998267934924593e-05, "loss": 0.4902, "num_input_tokens_seen": 961472, "step": 2865 }, { "epoch": 2.2179289026275115, "grad_norm": 0.5688225030899048, "learning_rate": 4.9982046180822475e-05, "loss": 0.5629, "num_input_tokens_seen": 963040, "step": 2870 }, { "epoch": 2.2217928902627513, "grad_norm": 1.056673288345337, "learning_rate": 4.9981401650414806e-05, "loss": 0.4768, "num_input_tokens_seen": 964896, "step": 2875 }, { "epoch": 2.2256568778979906, "grad_norm": 1.1413931846618652, "learning_rate": 4.998074575831606e-05, "loss": 0.8197, "num_input_tokens_seen": 966464, "step": 2880 }, { "epoch": 2.2295208655332304, "grad_norm": 0.5314350128173828, "learning_rate": 4.998007850482454e-05, "loss": 0.6064, "num_input_tokens_seen": 968480, "step": 2885 }, { "epoch": 2.2333848531684697, "grad_norm": 0.6819086074829102, "learning_rate": 4.997939989024372e-05, "loss": 0.685, "num_input_tokens_seen": 970304, "step": 2890 }, { "epoch": 2.2372488408037094, "grad_norm": 1.2902984619140625, "learning_rate": 4.9978709914882225e-05, "loss": 0.8468, "num_input_tokens_seen": 972064, "step": 2895 }, { "epoch": 2.2411128284389488, "grad_norm": 0.5623127818107605, "learning_rate": 4.997800857905387e-05, "loss": 0.5806, "num_input_tokens_seen": 973632, "step": 2900 }, { "epoch": 2.2449768160741885, "grad_norm": 1.0874032974243164, "learning_rate": 4.9977295883077634e-05, "loss": 0.8858, "num_input_tokens_seen": 975488, "step": 2905 }, { "epoch": 2.2488408037094283, "grad_norm": 0.7127791047096252, "learning_rate": 4.997657182727764e-05, "loss": 0.4643, "num_input_tokens_seen": 977312, "step": 2910 }, { "epoch": 2.2527047913446676, "grad_norm": 0.4747318923473358, "learning_rate": 4.997583641198321e-05, "loss": 0.5587, "num_input_tokens_seen": 978976, "step": 2915 }, { "epoch": 2.2565687789799074, "grad_norm": 1.440902829170227, "learning_rate": 4.997508963752879e-05, "loss": 0.5, "num_input_tokens_seen": 980544, "step": 2920 }, { "epoch": 2.2604327666151467, "grad_norm": 0.6545812487602234, "learning_rate": 4.9974331504254047e-05, "loss": 0.6373, "num_input_tokens_seen": 982432, "step": 2925 }, { "epoch": 2.2642967542503865, "grad_norm": 0.862421989440918, "learning_rate": 4.997356201250376e-05, "loss": 0.4668, "num_input_tokens_seen": 984384, "step": 2930 }, { "epoch": 2.2681607418856258, "grad_norm": 0.5672481656074524, "learning_rate": 4.997278116262792e-05, "loss": 0.5184, "num_input_tokens_seen": 986208, "step": 2935 }, { "epoch": 2.2720247295208655, "grad_norm": 0.5468183159828186, "learning_rate": 4.997198895498164e-05, "loss": 0.6329, "num_input_tokens_seen": 987808, "step": 2940 }, { "epoch": 2.2758887171561053, "grad_norm": 1.1017963886260986, "learning_rate": 4.997118538992524e-05, "loss": 1.0126, "num_input_tokens_seen": 989408, "step": 2945 }, { "epoch": 2.2797527047913446, "grad_norm": 1.0476722717285156, "learning_rate": 4.9970370467824174e-05, "loss": 0.8039, "num_input_tokens_seen": 991008, "step": 2950 }, { "epoch": 2.2836166924265844, "grad_norm": 0.7327314019203186, "learning_rate": 4.996954418904908e-05, "loss": 0.5703, "num_input_tokens_seen": 992800, "step": 2955 }, { "epoch": 2.2874806800618237, "grad_norm": 1.1952277421951294, "learning_rate": 4.9968706553975754e-05, "loss": 0.7582, "num_input_tokens_seen": 994272, "step": 2960 }, { "epoch": 2.2913446676970635, "grad_norm": 0.5015440583229065, "learning_rate": 4.996785756298514e-05, "loss": 0.4687, "num_input_tokens_seen": 995904, "step": 2965 }, { "epoch": 2.295208655332303, "grad_norm": 0.4578341543674469, "learning_rate": 4.996699721646339e-05, "loss": 0.5531, "num_input_tokens_seen": 997664, "step": 2970 }, { "epoch": 2.2990726429675425, "grad_norm": 1.17210853099823, "learning_rate": 4.99661255148018e-05, "loss": 0.5348, "num_input_tokens_seen": 999360, "step": 2975 }, { "epoch": 2.3029366306027823, "grad_norm": 0.9824397563934326, "learning_rate": 4.996524245839679e-05, "loss": 0.6693, "num_input_tokens_seen": 1001152, "step": 2980 }, { "epoch": 2.3068006182380216, "grad_norm": 0.5156043171882629, "learning_rate": 4.9964348047650004e-05, "loss": 0.518, "num_input_tokens_seen": 1002912, "step": 2985 }, { "epoch": 2.3106646058732614, "grad_norm": 0.6254473328590393, "learning_rate": 4.996344228296822e-05, "loss": 0.4402, "num_input_tokens_seen": 1004608, "step": 2990 }, { "epoch": 2.3145285935085007, "grad_norm": 0.6479689478874207, "learning_rate": 4.996252516476339e-05, "loss": 0.6588, "num_input_tokens_seen": 1006176, "step": 2995 }, { "epoch": 2.3183925811437405, "grad_norm": 0.6969127655029297, "learning_rate": 4.9961596693452615e-05, "loss": 0.5598, "num_input_tokens_seen": 1007648, "step": 3000 }, { "epoch": 2.32225656877898, "grad_norm": 1.2428017854690552, "learning_rate": 4.9960656869458176e-05, "loss": 0.4318, "num_input_tokens_seen": 1008960, "step": 3005 }, { "epoch": 2.3261205564142196, "grad_norm": 1.8418818712234497, "learning_rate": 4.995970569320752e-05, "loss": 0.5089, "num_input_tokens_seen": 1010560, "step": 3010 }, { "epoch": 2.329984544049459, "grad_norm": 0.555794894695282, "learning_rate": 4.995874316513322e-05, "loss": 0.5298, "num_input_tokens_seen": 1012032, "step": 3015 }, { "epoch": 2.3338485316846986, "grad_norm": 1.9546446800231934, "learning_rate": 4.995776928567306e-05, "loss": 0.654, "num_input_tokens_seen": 1013440, "step": 3020 }, { "epoch": 2.337712519319938, "grad_norm": 0.7914984226226807, "learning_rate": 4.995678405526997e-05, "loss": 0.4749, "num_input_tokens_seen": 1015008, "step": 3025 }, { "epoch": 2.3415765069551777, "grad_norm": 0.5873106122016907, "learning_rate": 4.995578747437203e-05, "loss": 0.5437, "num_input_tokens_seen": 1016544, "step": 3030 }, { "epoch": 2.3454404945904175, "grad_norm": 0.531721830368042, "learning_rate": 4.995477954343249e-05, "loss": 0.703, "num_input_tokens_seen": 1018080, "step": 3035 }, { "epoch": 2.349304482225657, "grad_norm": 0.4963063895702362, "learning_rate": 4.995376026290976e-05, "loss": 0.7038, "num_input_tokens_seen": 1019648, "step": 3040 }, { "epoch": 2.3531684698608966, "grad_norm": 0.5960531830787659, "learning_rate": 4.9952729633267425e-05, "loss": 0.4405, "num_input_tokens_seen": 1021152, "step": 3045 }, { "epoch": 2.357032457496136, "grad_norm": 0.8349996209144592, "learning_rate": 4.995168765497422e-05, "loss": 0.6092, "num_input_tokens_seen": 1023040, "step": 3050 }, { "epoch": 2.3608964451313756, "grad_norm": 1.595048427581787, "learning_rate": 4.995063432850403e-05, "loss": 0.9811, "num_input_tokens_seen": 1024832, "step": 3055 }, { "epoch": 2.364760432766615, "grad_norm": 1.043314814567566, "learning_rate": 4.9949569654335936e-05, "loss": 0.5609, "num_input_tokens_seen": 1026560, "step": 3060 }, { "epoch": 2.3686244204018547, "grad_norm": 1.8193224668502808, "learning_rate": 4.9948493632954144e-05, "loss": 0.9122, "num_input_tokens_seen": 1028448, "step": 3065 }, { "epoch": 2.3724884080370945, "grad_norm": 1.0578491687774658, "learning_rate": 4.994740626484803e-05, "loss": 0.5984, "num_input_tokens_seen": 1030016, "step": 3070 }, { "epoch": 2.376352395672334, "grad_norm": 0.5498404502868652, "learning_rate": 4.994630755051214e-05, "loss": 0.5438, "num_input_tokens_seen": 1031840, "step": 3075 }, { "epoch": 2.3802163833075736, "grad_norm": 0.6438480615615845, "learning_rate": 4.9945197490446194e-05, "loss": 0.7671, "num_input_tokens_seen": 1033632, "step": 3080 }, { "epoch": 2.384080370942813, "grad_norm": 0.6850314736366272, "learning_rate": 4.9944076085155024e-05, "loss": 0.4341, "num_input_tokens_seen": 1035200, "step": 3085 }, { "epoch": 2.3879443585780527, "grad_norm": 0.692349910736084, "learning_rate": 4.9942943335148674e-05, "loss": 0.6522, "num_input_tokens_seen": 1037088, "step": 3090 }, { "epoch": 2.391808346213292, "grad_norm": 0.8496260643005371, "learning_rate": 4.994179924094231e-05, "loss": 0.5938, "num_input_tokens_seen": 1039040, "step": 3095 }, { "epoch": 2.3956723338485317, "grad_norm": 0.8309586644172668, "learning_rate": 4.994064380305629e-05, "loss": 0.5084, "num_input_tokens_seen": 1040800, "step": 3100 }, { "epoch": 2.3995363214837715, "grad_norm": 1.0442888736724854, "learning_rate": 4.99394770220161e-05, "loss": 0.5993, "num_input_tokens_seen": 1042560, "step": 3105 }, { "epoch": 2.403400309119011, "grad_norm": 0.9322795867919922, "learning_rate": 4.99382988983524e-05, "loss": 0.6877, "num_input_tokens_seen": 1044224, "step": 3110 }, { "epoch": 2.4072642967542506, "grad_norm": 0.6494585871696472, "learning_rate": 4.993710943260102e-05, "loss": 0.4784, "num_input_tokens_seen": 1045760, "step": 3115 }, { "epoch": 2.41112828438949, "grad_norm": 0.5726670026779175, "learning_rate": 4.993590862530292e-05, "loss": 0.5407, "num_input_tokens_seen": 1047552, "step": 3120 }, { "epoch": 2.4149922720247297, "grad_norm": 0.8913784623146057, "learning_rate": 4.993469647700425e-05, "loss": 0.4975, "num_input_tokens_seen": 1049056, "step": 3125 }, { "epoch": 2.418856259659969, "grad_norm": 0.8732784986495972, "learning_rate": 4.993347298825629e-05, "loss": 0.5253, "num_input_tokens_seen": 1050688, "step": 3130 }, { "epoch": 2.4227202472952087, "grad_norm": 0.9251123666763306, "learning_rate": 4.993223815961549e-05, "loss": 0.5479, "num_input_tokens_seen": 1052288, "step": 3135 }, { "epoch": 2.426584234930448, "grad_norm": 0.9100888967514038, "learning_rate": 4.993099199164347e-05, "loss": 0.4578, "num_input_tokens_seen": 1053856, "step": 3140 }, { "epoch": 2.430448222565688, "grad_norm": 0.8087944984436035, "learning_rate": 4.992973448490698e-05, "loss": 0.7066, "num_input_tokens_seen": 1055584, "step": 3145 }, { "epoch": 2.434312210200927, "grad_norm": 0.9772184491157532, "learning_rate": 4.992846563997795e-05, "loss": 0.5876, "num_input_tokens_seen": 1057408, "step": 3150 }, { "epoch": 2.438176197836167, "grad_norm": 1.195980191230774, "learning_rate": 4.992718545743346e-05, "loss": 0.7171, "num_input_tokens_seen": 1059040, "step": 3155 }, { "epoch": 2.4420401854714067, "grad_norm": 0.6627739071846008, "learning_rate": 4.9925893937855726e-05, "loss": 0.5255, "num_input_tokens_seen": 1060864, "step": 3160 }, { "epoch": 2.445904173106646, "grad_norm": 0.7673016786575317, "learning_rate": 4.992459108183217e-05, "loss": 0.9482, "num_input_tokens_seen": 1062496, "step": 3165 }, { "epoch": 2.4497681607418857, "grad_norm": 1.467333197593689, "learning_rate": 4.9923276889955317e-05, "loss": 0.6209, "num_input_tokens_seen": 1064256, "step": 3170 }, { "epoch": 2.453632148377125, "grad_norm": 1.2827228307724, "learning_rate": 4.992195136282287e-05, "loss": 0.4763, "num_input_tokens_seen": 1065952, "step": 3175 }, { "epoch": 2.457496136012365, "grad_norm": 0.6949454545974731, "learning_rate": 4.99206145010377e-05, "loss": 0.519, "num_input_tokens_seen": 1067520, "step": 3180 }, { "epoch": 2.461360123647604, "grad_norm": 1.0481961965560913, "learning_rate": 4.9919266305207806e-05, "loss": 0.5459, "num_input_tokens_seen": 1069216, "step": 3185 }, { "epoch": 2.465224111282844, "grad_norm": 0.4766972064971924, "learning_rate": 4.9917906775946366e-05, "loss": 0.5541, "num_input_tokens_seen": 1070688, "step": 3190 }, { "epoch": 2.4690880989180837, "grad_norm": 0.6638417840003967, "learning_rate": 4.9916535913871685e-05, "loss": 0.6379, "num_input_tokens_seen": 1072288, "step": 3195 }, { "epoch": 2.472952086553323, "grad_norm": 0.41422492265701294, "learning_rate": 4.9915153719607266e-05, "loss": 0.4947, "num_input_tokens_seen": 1073984, "step": 3200 }, { "epoch": 2.4768160741885628, "grad_norm": 0.7062453031539917, "learning_rate": 4.991376019378172e-05, "loss": 0.6639, "num_input_tokens_seen": 1075776, "step": 3205 }, { "epoch": 2.480680061823802, "grad_norm": 0.6917634010314941, "learning_rate": 4.991235533702883e-05, "loss": 0.5186, "num_input_tokens_seen": 1077600, "step": 3210 }, { "epoch": 2.484544049459042, "grad_norm": 1.3653064966201782, "learning_rate": 4.991093914998754e-05, "loss": 0.4998, "num_input_tokens_seen": 1079360, "step": 3215 }, { "epoch": 2.488408037094281, "grad_norm": 0.7623108625411987, "learning_rate": 4.990951163330194e-05, "loss": 0.5339, "num_input_tokens_seen": 1081056, "step": 3220 }, { "epoch": 2.492272024729521, "grad_norm": 0.7102620005607605, "learning_rate": 4.990807278762127e-05, "loss": 0.5231, "num_input_tokens_seen": 1082816, "step": 3225 }, { "epoch": 2.4961360123647607, "grad_norm": 0.9887121915817261, "learning_rate": 4.990662261359993e-05, "loss": 0.4831, "num_input_tokens_seen": 1084544, "step": 3230 }, { "epoch": 2.5, "grad_norm": 1.3507399559020996, "learning_rate": 4.990516111189747e-05, "loss": 0.6851, "num_input_tokens_seen": 1086560, "step": 3235 }, { "epoch": 2.5038639876352393, "grad_norm": 0.3870268166065216, "learning_rate": 4.990368828317857e-05, "loss": 0.5602, "num_input_tokens_seen": 1088064, "step": 3240 }, { "epoch": 2.507727975270479, "grad_norm": 1.113446831703186, "learning_rate": 4.990220412811311e-05, "loss": 0.8084, "num_input_tokens_seen": 1089728, "step": 3245 }, { "epoch": 2.511591962905719, "grad_norm": 0.7764049768447876, "learning_rate": 4.990070864737608e-05, "loss": 0.425, "num_input_tokens_seen": 1091424, "step": 3250 }, { "epoch": 2.515455950540958, "grad_norm": 0.7448837161064148, "learning_rate": 4.989920184164763e-05, "loss": 0.6187, "num_input_tokens_seen": 1092960, "step": 3255 }, { "epoch": 2.519319938176198, "grad_norm": 0.4950990378856659, "learning_rate": 4.989768371161306e-05, "loss": 0.5597, "num_input_tokens_seen": 1094944, "step": 3260 }, { "epoch": 2.5231839258114372, "grad_norm": 1.2862695455551147, "learning_rate": 4.989615425796283e-05, "loss": 0.6299, "num_input_tokens_seen": 1096928, "step": 3265 }, { "epoch": 2.527047913446677, "grad_norm": 1.1394457817077637, "learning_rate": 4.989461348139256e-05, "loss": 0.8357, "num_input_tokens_seen": 1098624, "step": 3270 }, { "epoch": 2.5309119010819163, "grad_norm": 0.6639581322669983, "learning_rate": 4.9893061382602985e-05, "loss": 0.4765, "num_input_tokens_seen": 1100288, "step": 3275 }, { "epoch": 2.534775888717156, "grad_norm": 1.2966827154159546, "learning_rate": 4.9891497962300017e-05, "loss": 0.5266, "num_input_tokens_seen": 1101792, "step": 3280 }, { "epoch": 2.538639876352396, "grad_norm": 0.9795219898223877, "learning_rate": 4.98899232211947e-05, "loss": 0.6875, "num_input_tokens_seen": 1103680, "step": 3285 }, { "epoch": 2.542503863987635, "grad_norm": 0.4587763845920563, "learning_rate": 4.988833716000324e-05, "loss": 0.507, "num_input_tokens_seen": 1105312, "step": 3290 }, { "epoch": 2.546367851622875, "grad_norm": 0.5610541105270386, "learning_rate": 4.9886739779447e-05, "loss": 0.7285, "num_input_tokens_seen": 1106784, "step": 3295 }, { "epoch": 2.5502318392581143, "grad_norm": 1.0507104396820068, "learning_rate": 4.9885131080252454e-05, "loss": 0.4702, "num_input_tokens_seen": 1108352, "step": 3300 }, { "epoch": 2.554095826893354, "grad_norm": 0.9816800951957703, "learning_rate": 4.9883511063151274e-05, "loss": 0.4299, "num_input_tokens_seen": 1109984, "step": 3305 }, { "epoch": 2.5579598145285933, "grad_norm": 0.859944224357605, "learning_rate": 4.988187972888023e-05, "loss": 0.6633, "num_input_tokens_seen": 1111392, "step": 3310 }, { "epoch": 2.561823802163833, "grad_norm": 0.6887502074241638, "learning_rate": 4.988023707818129e-05, "loss": 0.4772, "num_input_tokens_seen": 1113312, "step": 3315 }, { "epoch": 2.565687789799073, "grad_norm": 0.5387053489685059, "learning_rate": 4.9878583111801506e-05, "loss": 0.6608, "num_input_tokens_seen": 1115200, "step": 3320 }, { "epoch": 2.569551777434312, "grad_norm": 0.653689980506897, "learning_rate": 4.987691783049314e-05, "loss": 0.4618, "num_input_tokens_seen": 1116928, "step": 3325 }, { "epoch": 2.573415765069552, "grad_norm": 0.7021830677986145, "learning_rate": 4.9875241235013566e-05, "loss": 0.5116, "num_input_tokens_seen": 1118720, "step": 3330 }, { "epoch": 2.5772797527047913, "grad_norm": 0.7028103470802307, "learning_rate": 4.98735533261253e-05, "loss": 0.4801, "num_input_tokens_seen": 1120608, "step": 3335 }, { "epoch": 2.581143740340031, "grad_norm": 1.1614089012145996, "learning_rate": 4.987185410459602e-05, "loss": 0.513, "num_input_tokens_seen": 1122272, "step": 3340 }, { "epoch": 2.5850077279752703, "grad_norm": 0.514863133430481, "learning_rate": 4.9870143571198545e-05, "loss": 0.4814, "num_input_tokens_seen": 1124000, "step": 3345 }, { "epoch": 2.58887171561051, "grad_norm": 0.7317607402801514, "learning_rate": 4.986842172671083e-05, "loss": 0.5232, "num_input_tokens_seen": 1125856, "step": 3350 }, { "epoch": 2.59273570324575, "grad_norm": 0.7239667177200317, "learning_rate": 4.9866688571915984e-05, "loss": 0.5756, "num_input_tokens_seen": 1127776, "step": 3355 }, { "epoch": 2.596599690880989, "grad_norm": 0.7093083262443542, "learning_rate": 4.986494410760225e-05, "loss": 0.8255, "num_input_tokens_seen": 1129504, "step": 3360 }, { "epoch": 2.6004636785162285, "grad_norm": 0.7626116275787354, "learning_rate": 4.986318833456303e-05, "loss": 0.5663, "num_input_tokens_seen": 1131104, "step": 3365 }, { "epoch": 2.6043276661514683, "grad_norm": 1.4797747135162354, "learning_rate": 4.9861421253596854e-05, "loss": 0.6626, "num_input_tokens_seen": 1132608, "step": 3370 }, { "epoch": 2.608191653786708, "grad_norm": 1.1286413669586182, "learning_rate": 4.985964286550741e-05, "loss": 0.9175, "num_input_tokens_seen": 1134176, "step": 3375 }, { "epoch": 2.6120556414219473, "grad_norm": 0.4797998368740082, "learning_rate": 4.98578531711035e-05, "loss": 0.5564, "num_input_tokens_seen": 1135840, "step": 3380 }, { "epoch": 2.615919629057187, "grad_norm": 1.1109439134597778, "learning_rate": 4.985605217119911e-05, "loss": 0.5208, "num_input_tokens_seen": 1137792, "step": 3385 }, { "epoch": 2.6197836166924264, "grad_norm": 0.6518808603286743, "learning_rate": 4.985423986661333e-05, "loss": 0.5531, "num_input_tokens_seen": 1139488, "step": 3390 }, { "epoch": 2.623647604327666, "grad_norm": 0.639610230922699, "learning_rate": 4.985241625817041e-05, "loss": 0.4742, "num_input_tokens_seen": 1141120, "step": 3395 }, { "epoch": 2.6275115919629055, "grad_norm": 0.5176666378974915, "learning_rate": 4.985058134669975e-05, "loss": 0.5513, "num_input_tokens_seen": 1142784, "step": 3400 }, { "epoch": 2.6313755795981453, "grad_norm": 0.8704034090042114, "learning_rate": 4.984873513303586e-05, "loss": 0.4827, "num_input_tokens_seen": 1144320, "step": 3405 }, { "epoch": 2.635239567233385, "grad_norm": 0.660564661026001, "learning_rate": 4.984687761801842e-05, "loss": 0.534, "num_input_tokens_seen": 1146016, "step": 3410 }, { "epoch": 2.6391035548686244, "grad_norm": 0.9027625918388367, "learning_rate": 4.9845008802492245e-05, "loss": 1.0709, "num_input_tokens_seen": 1147712, "step": 3415 }, { "epoch": 2.642967542503864, "grad_norm": 0.75700443983078, "learning_rate": 4.984312868730727e-05, "loss": 0.557, "num_input_tokens_seen": 1149312, "step": 3420 }, { "epoch": 2.6468315301391034, "grad_norm": 0.8463603854179382, "learning_rate": 4.984123727331859e-05, "loss": 0.7364, "num_input_tokens_seen": 1151008, "step": 3425 }, { "epoch": 2.650695517774343, "grad_norm": 0.559880793094635, "learning_rate": 4.983933456138642e-05, "loss": 0.5928, "num_input_tokens_seen": 1152832, "step": 3430 }, { "epoch": 2.6545595054095825, "grad_norm": 1.4359910488128662, "learning_rate": 4.9837420552376144e-05, "loss": 0.7694, "num_input_tokens_seen": 1154432, "step": 3435 }, { "epoch": 2.6584234930448223, "grad_norm": 0.9443642497062683, "learning_rate": 4.983549524715825e-05, "loss": 0.5589, "num_input_tokens_seen": 1156288, "step": 3440 }, { "epoch": 2.662287480680062, "grad_norm": 0.8228883147239685, "learning_rate": 4.983355864660839e-05, "loss": 0.7817, "num_input_tokens_seen": 1157888, "step": 3445 }, { "epoch": 2.6661514683153014, "grad_norm": 0.6609340906143188, "learning_rate": 4.983161075160733e-05, "loss": 0.4492, "num_input_tokens_seen": 1159552, "step": 3450 }, { "epoch": 2.6700154559505407, "grad_norm": 0.6068282723426819, "learning_rate": 4.982965156304099e-05, "loss": 0.5081, "num_input_tokens_seen": 1161184, "step": 3455 }, { "epoch": 2.6738794435857804, "grad_norm": 0.7090804576873779, "learning_rate": 4.9827681081800423e-05, "loss": 0.418, "num_input_tokens_seen": 1163008, "step": 3460 }, { "epoch": 2.67774343122102, "grad_norm": 0.8882567286491394, "learning_rate": 4.982569930878181e-05, "loss": 0.5861, "num_input_tokens_seen": 1165056, "step": 3465 }, { "epoch": 2.6816074188562595, "grad_norm": 0.8435354232788086, "learning_rate": 4.982370624488648e-05, "loss": 0.6947, "num_input_tokens_seen": 1166816, "step": 3470 }, { "epoch": 2.6854714064914993, "grad_norm": 0.7133594155311584, "learning_rate": 4.9821701891020887e-05, "loss": 0.5984, "num_input_tokens_seen": 1168320, "step": 3475 }, { "epoch": 2.689335394126739, "grad_norm": 0.562760055065155, "learning_rate": 4.981968624809662e-05, "loss": 0.4174, "num_input_tokens_seen": 1170112, "step": 3480 }, { "epoch": 2.6931993817619784, "grad_norm": 1.057307243347168, "learning_rate": 4.981765931703041e-05, "loss": 0.4646, "num_input_tokens_seen": 1171904, "step": 3485 }, { "epoch": 2.6970633693972177, "grad_norm": 0.5769866704940796, "learning_rate": 4.9815621098744115e-05, "loss": 0.4754, "num_input_tokens_seen": 1173472, "step": 3490 }, { "epoch": 2.7009273570324575, "grad_norm": 0.7840792536735535, "learning_rate": 4.9813571594164726e-05, "loss": 0.4396, "num_input_tokens_seen": 1175008, "step": 3495 }, { "epoch": 2.704791344667697, "grad_norm": 0.8611481785774231, "learning_rate": 4.981151080422437e-05, "loss": 0.4862, "num_input_tokens_seen": 1176608, "step": 3500 }, { "epoch": 2.7086553323029365, "grad_norm": 1.6045947074890137, "learning_rate": 4.980943872986033e-05, "loss": 0.9338, "num_input_tokens_seen": 1178240, "step": 3505 }, { "epoch": 2.7125193199381763, "grad_norm": 0.44014978408813477, "learning_rate": 4.980735537201495e-05, "loss": 0.498, "num_input_tokens_seen": 1179744, "step": 3510 }, { "epoch": 2.7163833075734156, "grad_norm": 1.0759127140045166, "learning_rate": 4.9805260731635794e-05, "loss": 0.6792, "num_input_tokens_seen": 1181312, "step": 3515 }, { "epoch": 2.7202472952086554, "grad_norm": 1.0745078325271606, "learning_rate": 4.980315480967551e-05, "loss": 0.8285, "num_input_tokens_seen": 1182784, "step": 3520 }, { "epoch": 2.7241112828438947, "grad_norm": 0.687350332736969, "learning_rate": 4.980103760709187e-05, "loss": 0.4955, "num_input_tokens_seen": 1184448, "step": 3525 }, { "epoch": 2.7279752704791345, "grad_norm": 0.3752714991569519, "learning_rate": 4.9798909124847804e-05, "loss": 0.6341, "num_input_tokens_seen": 1185952, "step": 3530 }, { "epoch": 2.7318392581143742, "grad_norm": 0.4747561514377594, "learning_rate": 4.979676936391135e-05, "loss": 0.4625, "num_input_tokens_seen": 1187776, "step": 3535 }, { "epoch": 2.7357032457496135, "grad_norm": 0.604560136795044, "learning_rate": 4.979461832525569e-05, "loss": 0.4366, "num_input_tokens_seen": 1189376, "step": 3540 }, { "epoch": 2.7395672333848533, "grad_norm": 0.5885705351829529, "learning_rate": 4.9792456009859126e-05, "loss": 0.4993, "num_input_tokens_seen": 1190784, "step": 3545 }, { "epoch": 2.7434312210200926, "grad_norm": 0.6086111664772034, "learning_rate": 4.979028241870509e-05, "loss": 0.4174, "num_input_tokens_seen": 1192384, "step": 3550 }, { "epoch": 2.7472952086553324, "grad_norm": 1.881892204284668, "learning_rate": 4.978809755278215e-05, "loss": 0.6374, "num_input_tokens_seen": 1194240, "step": 3555 }, { "epoch": 2.7511591962905717, "grad_norm": 1.292818546295166, "learning_rate": 4.978590141308399e-05, "loss": 0.5888, "num_input_tokens_seen": 1195680, "step": 3560 }, { "epoch": 2.7550231839258115, "grad_norm": 0.7079321146011353, "learning_rate": 4.978369400060943e-05, "loss": 0.6528, "num_input_tokens_seen": 1197408, "step": 3565 }, { "epoch": 2.7588871715610512, "grad_norm": 0.4388181269168854, "learning_rate": 4.978147531636241e-05, "loss": 0.6013, "num_input_tokens_seen": 1199040, "step": 3570 }, { "epoch": 2.7627511591962906, "grad_norm": 1.160688042640686, "learning_rate": 4.977924536135202e-05, "loss": 0.6527, "num_input_tokens_seen": 1200640, "step": 3575 }, { "epoch": 2.76661514683153, "grad_norm": 0.6386562585830688, "learning_rate": 4.977700413659243e-05, "loss": 0.4499, "num_input_tokens_seen": 1202496, "step": 3580 }, { "epoch": 2.7704791344667696, "grad_norm": 1.24155855178833, "learning_rate": 4.977475164310298e-05, "loss": 0.5857, "num_input_tokens_seen": 1204096, "step": 3585 }, { "epoch": 2.7743431221020094, "grad_norm": 0.9279860854148865, "learning_rate": 4.9772487881908115e-05, "loss": 0.4689, "num_input_tokens_seen": 1205760, "step": 3590 }, { "epoch": 2.7782071097372487, "grad_norm": 0.8585197329521179, "learning_rate": 4.97702128540374e-05, "loss": 0.5638, "num_input_tokens_seen": 1207392, "step": 3595 }, { "epoch": 2.7820710973724885, "grad_norm": 0.5836890339851379, "learning_rate": 4.9767926560525536e-05, "loss": 0.4829, "num_input_tokens_seen": 1209152, "step": 3600 }, { "epoch": 2.7859350850077282, "grad_norm": 0.6728211045265198, "learning_rate": 4.9765629002412346e-05, "loss": 0.4758, "num_input_tokens_seen": 1210784, "step": 3605 }, { "epoch": 2.7897990726429676, "grad_norm": 0.5542731285095215, "learning_rate": 4.976332018074277e-05, "loss": 0.419, "num_input_tokens_seen": 1212320, "step": 3610 }, { "epoch": 2.793663060278207, "grad_norm": 1.1383287906646729, "learning_rate": 4.976100009656687e-05, "loss": 0.5553, "num_input_tokens_seen": 1214368, "step": 3615 }, { "epoch": 2.7975270479134466, "grad_norm": 1.2590012550354004, "learning_rate": 4.975866875093984e-05, "loss": 0.6298, "num_input_tokens_seen": 1215936, "step": 3620 }, { "epoch": 2.8013910355486864, "grad_norm": 0.6190434694290161, "learning_rate": 4.975632614492199e-05, "loss": 0.6616, "num_input_tokens_seen": 1217696, "step": 3625 }, { "epoch": 2.8052550231839257, "grad_norm": 1.4215384721755981, "learning_rate": 4.975397227957875e-05, "loss": 0.5881, "num_input_tokens_seen": 1219232, "step": 3630 }, { "epoch": 2.8091190108191655, "grad_norm": 0.6871613264083862, "learning_rate": 4.9751607155980676e-05, "loss": 0.7331, "num_input_tokens_seen": 1220928, "step": 3635 }, { "epoch": 2.812982998454405, "grad_norm": 1.0291574001312256, "learning_rate": 4.9749230775203425e-05, "loss": 0.6462, "num_input_tokens_seen": 1222656, "step": 3640 }, { "epoch": 2.8168469860896446, "grad_norm": 0.9636913537979126, "learning_rate": 4.9746843138327806e-05, "loss": 0.5418, "num_input_tokens_seen": 1224448, "step": 3645 }, { "epoch": 2.820710973724884, "grad_norm": 1.3144199848175049, "learning_rate": 4.974444424643973e-05, "loss": 0.4525, "num_input_tokens_seen": 1226016, "step": 3650 }, { "epoch": 2.8245749613601236, "grad_norm": 0.5493863224983215, "learning_rate": 4.974203410063021e-05, "loss": 0.4169, "num_input_tokens_seen": 1227968, "step": 3655 }, { "epoch": 2.8284389489953634, "grad_norm": 0.7186553478240967, "learning_rate": 4.9739612701995414e-05, "loss": 0.4147, "num_input_tokens_seen": 1229600, "step": 3660 }, { "epoch": 2.8323029366306027, "grad_norm": 0.7278918623924255, "learning_rate": 4.97371800516366e-05, "loss": 0.4865, "num_input_tokens_seen": 1231328, "step": 3665 }, { "epoch": 2.8361669242658425, "grad_norm": 0.5465059876441956, "learning_rate": 4.973473615066015e-05, "loss": 0.4228, "num_input_tokens_seen": 1232704, "step": 3670 }, { "epoch": 2.840030911901082, "grad_norm": 1.7068525552749634, "learning_rate": 4.973228100017757e-05, "loss": 0.6036, "num_input_tokens_seen": 1234144, "step": 3675 }, { "epoch": 2.8438948995363216, "grad_norm": 0.7545850872993469, "learning_rate": 4.972981460130548e-05, "loss": 0.5032, "num_input_tokens_seen": 1235872, "step": 3680 }, { "epoch": 2.847758887171561, "grad_norm": 0.7056130766868591, "learning_rate": 4.9727336955165606e-05, "loss": 0.4416, "num_input_tokens_seen": 1237536, "step": 3685 }, { "epoch": 2.8516228748068007, "grad_norm": 0.5779761672019958, "learning_rate": 4.97248480628848e-05, "loss": 0.5039, "num_input_tokens_seen": 1239104, "step": 3690 }, { "epoch": 2.8554868624420404, "grad_norm": 0.675998866558075, "learning_rate": 4.972234792559503e-05, "loss": 0.6638, "num_input_tokens_seen": 1240960, "step": 3695 }, { "epoch": 2.8593508500772797, "grad_norm": 0.743076741695404, "learning_rate": 4.971983654443335e-05, "loss": 0.6019, "num_input_tokens_seen": 1242528, "step": 3700 }, { "epoch": 2.863214837712519, "grad_norm": 0.7421184182167053, "learning_rate": 4.971731392054198e-05, "loss": 0.5063, "num_input_tokens_seen": 1244416, "step": 3705 }, { "epoch": 2.867078825347759, "grad_norm": 1.1630306243896484, "learning_rate": 4.971478005506821e-05, "loss": 0.5796, "num_input_tokens_seen": 1246208, "step": 3710 }, { "epoch": 2.8709428129829986, "grad_norm": 0.8378121256828308, "learning_rate": 4.971223494916446e-05, "loss": 0.5494, "num_input_tokens_seen": 1247680, "step": 3715 }, { "epoch": 2.874806800618238, "grad_norm": 0.8370673656463623, "learning_rate": 4.970967860398825e-05, "loss": 0.5313, "num_input_tokens_seen": 1249600, "step": 3720 }, { "epoch": 2.8786707882534777, "grad_norm": 1.018075704574585, "learning_rate": 4.9707111020702245e-05, "loss": 0.5434, "num_input_tokens_seen": 1251328, "step": 3725 }, { "epoch": 2.8825347758887174, "grad_norm": 1.0961360931396484, "learning_rate": 4.970453220047417e-05, "loss": 0.5645, "num_input_tokens_seen": 1252928, "step": 3730 }, { "epoch": 2.8863987635239567, "grad_norm": 0.7587810158729553, "learning_rate": 4.970194214447691e-05, "loss": 0.7578, "num_input_tokens_seen": 1254912, "step": 3735 }, { "epoch": 2.890262751159196, "grad_norm": 0.650869607925415, "learning_rate": 4.9699340853888435e-05, "loss": 0.9094, "num_input_tokens_seen": 1256672, "step": 3740 }, { "epoch": 2.894126738794436, "grad_norm": 0.7845134735107422, "learning_rate": 4.9696728329891806e-05, "loss": 0.5603, "num_input_tokens_seen": 1258432, "step": 3745 }, { "epoch": 2.8979907264296756, "grad_norm": 0.7018628716468811, "learning_rate": 4.9694104573675236e-05, "loss": 0.6671, "num_input_tokens_seen": 1260192, "step": 3750 }, { "epoch": 2.901854714064915, "grad_norm": 1.0875087976455688, "learning_rate": 4.9691469586432025e-05, "loss": 0.4787, "num_input_tokens_seen": 1261984, "step": 3755 }, { "epoch": 2.9057187017001547, "grad_norm": 0.9164471626281738, "learning_rate": 4.968882336936056e-05, "loss": 0.4571, "num_input_tokens_seen": 1263584, "step": 3760 }, { "epoch": 2.909582689335394, "grad_norm": 0.5696697235107422, "learning_rate": 4.968616592366439e-05, "loss": 0.6024, "num_input_tokens_seen": 1265184, "step": 3765 }, { "epoch": 2.9134466769706338, "grad_norm": 0.8346107602119446, "learning_rate": 4.96834972505521e-05, "loss": 0.4952, "num_input_tokens_seen": 1266784, "step": 3770 }, { "epoch": 2.917310664605873, "grad_norm": 1.0693782567977905, "learning_rate": 4.968081735123745e-05, "loss": 0.4917, "num_input_tokens_seen": 1268448, "step": 3775 }, { "epoch": 2.921174652241113, "grad_norm": 0.8107241988182068, "learning_rate": 4.9678126226939255e-05, "loss": 0.4647, "num_input_tokens_seen": 1270176, "step": 3780 }, { "epoch": 2.9250386398763526, "grad_norm": 0.5345110893249512, "learning_rate": 4.967542387888146e-05, "loss": 0.5917, "num_input_tokens_seen": 1272032, "step": 3785 }, { "epoch": 2.928902627511592, "grad_norm": 0.7002302408218384, "learning_rate": 4.9672710308293115e-05, "loss": 0.5415, "num_input_tokens_seen": 1273504, "step": 3790 }, { "epoch": 2.9327666151468317, "grad_norm": 1.468747854232788, "learning_rate": 4.966998551640836e-05, "loss": 0.5125, "num_input_tokens_seen": 1275328, "step": 3795 }, { "epoch": 2.936630602782071, "grad_norm": 0.660334050655365, "learning_rate": 4.966724950446644e-05, "loss": 0.6857, "num_input_tokens_seen": 1277056, "step": 3800 }, { "epoch": 2.9404945904173108, "grad_norm": 1.1957876682281494, "learning_rate": 4.9664502273711735e-05, "loss": 0.7844, "num_input_tokens_seen": 1278816, "step": 3805 }, { "epoch": 2.94435857805255, "grad_norm": 0.6576964259147644, "learning_rate": 4.966174382539367e-05, "loss": 0.6998, "num_input_tokens_seen": 1280800, "step": 3810 }, { "epoch": 2.94822256568779, "grad_norm": 0.8073583841323853, "learning_rate": 4.965897416076683e-05, "loss": 0.6345, "num_input_tokens_seen": 1282560, "step": 3815 }, { "epoch": 2.9520865533230296, "grad_norm": 0.8566087484359741, "learning_rate": 4.965619328109086e-05, "loss": 0.7406, "num_input_tokens_seen": 1284256, "step": 3820 }, { "epoch": 2.955950540958269, "grad_norm": 0.5019268989562988, "learning_rate": 4.9653401187630535e-05, "loss": 0.593, "num_input_tokens_seen": 1285792, "step": 3825 }, { "epoch": 2.9598145285935082, "grad_norm": 0.6868906617164612, "learning_rate": 4.965059788165569e-05, "loss": 0.5807, "num_input_tokens_seen": 1287744, "step": 3830 }, { "epoch": 2.963678516228748, "grad_norm": 0.4995618760585785, "learning_rate": 4.9647783364441315e-05, "loss": 0.5419, "num_input_tokens_seen": 1289152, "step": 3835 }, { "epoch": 2.9675425038639878, "grad_norm": 0.9456138610839844, "learning_rate": 4.964495763726745e-05, "loss": 0.4905, "num_input_tokens_seen": 1290976, "step": 3840 }, { "epoch": 2.971406491499227, "grad_norm": 0.714795708656311, "learning_rate": 4.964212070141927e-05, "loss": 0.6204, "num_input_tokens_seen": 1292576, "step": 3845 }, { "epoch": 2.975270479134467, "grad_norm": 0.5990883708000183, "learning_rate": 4.963927255818701e-05, "loss": 0.5856, "num_input_tokens_seen": 1294112, "step": 3850 }, { "epoch": 2.9791344667697066, "grad_norm": 0.6102612018585205, "learning_rate": 4.9636413208866026e-05, "loss": 0.6263, "num_input_tokens_seen": 1295872, "step": 3855 }, { "epoch": 2.982998454404946, "grad_norm": 0.6648209691047668, "learning_rate": 4.963354265475678e-05, "loss": 0.5096, "num_input_tokens_seen": 1297472, "step": 3860 }, { "epoch": 2.9868624420401853, "grad_norm": 1.032414197921753, "learning_rate": 4.963066089716481e-05, "loss": 0.6407, "num_input_tokens_seen": 1299104, "step": 3865 }, { "epoch": 2.990726429675425, "grad_norm": 0.8999525904655457, "learning_rate": 4.9627767937400754e-05, "loss": 0.4826, "num_input_tokens_seen": 1300512, "step": 3870 }, { "epoch": 2.9945904173106648, "grad_norm": 0.7242717146873474, "learning_rate": 4.962486377678035e-05, "loss": 0.4502, "num_input_tokens_seen": 1302080, "step": 3875 }, { "epoch": 2.998454404945904, "grad_norm": 0.7318016290664673, "learning_rate": 4.962194841662443e-05, "loss": 0.4608, "num_input_tokens_seen": 1303744, "step": 3880 }, { "epoch": 3.0, "eval_loss": 0.5673310160636902, "eval_runtime": 6.369, "eval_samples_per_second": 90.281, "eval_steps_per_second": 22.609, "num_input_tokens_seen": 1304160, "step": 3882 }, { "epoch": 3.002318392581144, "grad_norm": 0.8186526894569397, "learning_rate": 4.961902185825892e-05, "loss": 0.4906, "num_input_tokens_seen": 1305056, "step": 3885 }, { "epoch": 3.006182380216383, "grad_norm": 0.8218995928764343, "learning_rate": 4.961608410301482e-05, "loss": 0.511, "num_input_tokens_seen": 1306912, "step": 3890 }, { "epoch": 3.010046367851623, "grad_norm": 0.6464802026748657, "learning_rate": 4.961313515222826e-05, "loss": 0.6282, "num_input_tokens_seen": 1308608, "step": 3895 }, { "epoch": 3.0139103554868623, "grad_norm": 1.1723092794418335, "learning_rate": 4.9610175007240424e-05, "loss": 1.0862, "num_input_tokens_seen": 1310464, "step": 3900 }, { "epoch": 3.017774343122102, "grad_norm": 1.6918052434921265, "learning_rate": 4.960720366939762e-05, "loss": 0.4976, "num_input_tokens_seen": 1312192, "step": 3905 }, { "epoch": 3.021638330757342, "grad_norm": 1.2022000551223755, "learning_rate": 4.960422114005121e-05, "loss": 0.8687, "num_input_tokens_seen": 1313856, "step": 3910 }, { "epoch": 3.025502318392581, "grad_norm": 1.0540682077407837, "learning_rate": 4.9601227420557675e-05, "loss": 0.4578, "num_input_tokens_seen": 1315552, "step": 3915 }, { "epoch": 3.029366306027821, "grad_norm": 0.5668230652809143, "learning_rate": 4.959822251227858e-05, "loss": 0.3935, "num_input_tokens_seen": 1317152, "step": 3920 }, { "epoch": 3.03323029366306, "grad_norm": 0.708391010761261, "learning_rate": 4.959520641658058e-05, "loss": 0.5329, "num_input_tokens_seen": 1318720, "step": 3925 }, { "epoch": 3.0370942812983, "grad_norm": 0.5779711008071899, "learning_rate": 4.9592179134835406e-05, "loss": 0.4852, "num_input_tokens_seen": 1320480, "step": 3930 }, { "epoch": 3.0409582689335393, "grad_norm": 0.6989569067955017, "learning_rate": 4.958914066841988e-05, "loss": 0.5026, "num_input_tokens_seen": 1322272, "step": 3935 }, { "epoch": 3.044822256568779, "grad_norm": 0.5338247418403625, "learning_rate": 4.9586091018715916e-05, "loss": 0.4168, "num_input_tokens_seen": 1324032, "step": 3940 }, { "epoch": 3.0486862442040183, "grad_norm": 0.6969642043113708, "learning_rate": 4.9583030187110525e-05, "loss": 0.4722, "num_input_tokens_seen": 1325472, "step": 3945 }, { "epoch": 3.052550231839258, "grad_norm": 0.5268831849098206, "learning_rate": 4.957995817499578e-05, "loss": 0.4212, "num_input_tokens_seen": 1326976, "step": 3950 }, { "epoch": 3.056414219474498, "grad_norm": 1.0191632509231567, "learning_rate": 4.957687498376886e-05, "loss": 0.449, "num_input_tokens_seen": 1328608, "step": 3955 }, { "epoch": 3.060278207109737, "grad_norm": 0.6664442420005798, "learning_rate": 4.9573780614832e-05, "loss": 0.5315, "num_input_tokens_seen": 1330080, "step": 3960 }, { "epoch": 3.064142194744977, "grad_norm": 0.5174227952957153, "learning_rate": 4.9570675069592553e-05, "loss": 0.6284, "num_input_tokens_seen": 1331776, "step": 3965 }, { "epoch": 3.0680061823802163, "grad_norm": 0.776060938835144, "learning_rate": 4.956755834946294e-05, "loss": 0.5158, "num_input_tokens_seen": 1333472, "step": 3970 }, { "epoch": 3.071870170015456, "grad_norm": 0.6581965088844299, "learning_rate": 4.9564430455860655e-05, "loss": 0.4797, "num_input_tokens_seen": 1335328, "step": 3975 }, { "epoch": 3.0757341576506954, "grad_norm": 1.386012077331543, "learning_rate": 4.95612913902083e-05, "loss": 0.4625, "num_input_tokens_seen": 1336992, "step": 3980 }, { "epoch": 3.079598145285935, "grad_norm": 0.5497900247573853, "learning_rate": 4.9558141153933515e-05, "loss": 0.5764, "num_input_tokens_seen": 1338464, "step": 3985 }, { "epoch": 3.0834621329211744, "grad_norm": 1.1419408321380615, "learning_rate": 4.955497974846907e-05, "loss": 0.5544, "num_input_tokens_seen": 1340192, "step": 3990 }, { "epoch": 3.087326120556414, "grad_norm": 0.8088237643241882, "learning_rate": 4.955180717525277e-05, "loss": 0.6095, "num_input_tokens_seen": 1341920, "step": 3995 }, { "epoch": 3.091190108191654, "grad_norm": 0.9308410286903381, "learning_rate": 4.954862343572755e-05, "loss": 0.47, "num_input_tokens_seen": 1343552, "step": 4000 }, { "epoch": 3.0950540958268933, "grad_norm": 0.7645550966262817, "learning_rate": 4.954542853134136e-05, "loss": 0.6207, "num_input_tokens_seen": 1345152, "step": 4005 }, { "epoch": 3.098918083462133, "grad_norm": 1.1955641508102417, "learning_rate": 4.9542222463547286e-05, "loss": 0.6137, "num_input_tokens_seen": 1346720, "step": 4010 }, { "epoch": 3.1027820710973724, "grad_norm": 0.7579068541526794, "learning_rate": 4.953900523380345e-05, "loss": 0.8345, "num_input_tokens_seen": 1348384, "step": 4015 }, { "epoch": 3.106646058732612, "grad_norm": 0.811932384967804, "learning_rate": 4.953577684357308e-05, "loss": 0.4569, "num_input_tokens_seen": 1350208, "step": 4020 }, { "epoch": 3.1105100463678514, "grad_norm": 0.6110729575157166, "learning_rate": 4.9532537294324456e-05, "loss": 0.5261, "num_input_tokens_seen": 1351904, "step": 4025 }, { "epoch": 3.114374034003091, "grad_norm": 1.1287775039672852, "learning_rate": 4.9529286587530955e-05, "loss": 0.5647, "num_input_tokens_seen": 1353792, "step": 4030 }, { "epoch": 3.118238021638331, "grad_norm": 0.6432676315307617, "learning_rate": 4.9526024724671014e-05, "loss": 0.469, "num_input_tokens_seen": 1355392, "step": 4035 }, { "epoch": 3.1221020092735703, "grad_norm": 0.5093907117843628, "learning_rate": 4.952275170722815e-05, "loss": 0.4117, "num_input_tokens_seen": 1357216, "step": 4040 }, { "epoch": 3.12596599690881, "grad_norm": 0.6048135757446289, "learning_rate": 4.951946753669095e-05, "loss": 0.5241, "num_input_tokens_seen": 1359040, "step": 4045 }, { "epoch": 3.1298299845440494, "grad_norm": 0.9261878132820129, "learning_rate": 4.951617221455307e-05, "loss": 0.4688, "num_input_tokens_seen": 1360704, "step": 4050 }, { "epoch": 3.133693972179289, "grad_norm": 1.6219602823257446, "learning_rate": 4.951286574231325e-05, "loss": 0.6421, "num_input_tokens_seen": 1362624, "step": 4055 }, { "epoch": 3.1375579598145285, "grad_norm": 1.2112438678741455, "learning_rate": 4.950954812147528e-05, "loss": 0.6916, "num_input_tokens_seen": 1364128, "step": 4060 }, { "epoch": 3.141421947449768, "grad_norm": 0.7805593609809875, "learning_rate": 4.9506219353548045e-05, "loss": 0.4493, "num_input_tokens_seen": 1365664, "step": 4065 }, { "epoch": 3.1452859350850075, "grad_norm": 0.8171101808547974, "learning_rate": 4.9502879440045494e-05, "loss": 0.61, "num_input_tokens_seen": 1367360, "step": 4070 }, { "epoch": 3.1491499227202473, "grad_norm": 0.7252092957496643, "learning_rate": 4.9499528382486624e-05, "loss": 0.6998, "num_input_tokens_seen": 1368896, "step": 4075 }, { "epoch": 3.153013910355487, "grad_norm": 0.5572108030319214, "learning_rate": 4.949616618239552e-05, "loss": 0.5005, "num_input_tokens_seen": 1370432, "step": 4080 }, { "epoch": 3.1568778979907264, "grad_norm": 0.5730881094932556, "learning_rate": 4.949279284130134e-05, "loss": 0.4971, "num_input_tokens_seen": 1372352, "step": 4085 }, { "epoch": 3.160741885625966, "grad_norm": 0.4607718586921692, "learning_rate": 4.94894083607383e-05, "loss": 0.4798, "num_input_tokens_seen": 1373856, "step": 4090 }, { "epoch": 3.1646058732612055, "grad_norm": 0.9823755621910095, "learning_rate": 4.948601274224567e-05, "loss": 0.5339, "num_input_tokens_seen": 1375456, "step": 4095 }, { "epoch": 3.1684698608964452, "grad_norm": 0.962943434715271, "learning_rate": 4.94826059873678e-05, "loss": 0.5944, "num_input_tokens_seen": 1377120, "step": 4100 }, { "epoch": 3.1723338485316845, "grad_norm": 0.8753464818000793, "learning_rate": 4.947918809765411e-05, "loss": 0.4141, "num_input_tokens_seen": 1378624, "step": 4105 }, { "epoch": 3.1761978361669243, "grad_norm": 1.3557876348495483, "learning_rate": 4.9475759074659076e-05, "loss": 0.6744, "num_input_tokens_seen": 1380448, "step": 4110 }, { "epoch": 3.1800618238021636, "grad_norm": 0.6762344837188721, "learning_rate": 4.947231891994223e-05, "loss": 0.448, "num_input_tokens_seen": 1381792, "step": 4115 }, { "epoch": 3.1839258114374034, "grad_norm": 0.5630887150764465, "learning_rate": 4.946886763506818e-05, "loss": 0.4611, "num_input_tokens_seen": 1383424, "step": 4120 }, { "epoch": 3.187789799072643, "grad_norm": 0.5391241312026978, "learning_rate": 4.94654052216066e-05, "loss": 0.4862, "num_input_tokens_seen": 1385184, "step": 4125 }, { "epoch": 3.1916537867078825, "grad_norm": 1.3723623752593994, "learning_rate": 4.94619316811322e-05, "loss": 0.7381, "num_input_tokens_seen": 1386944, "step": 4130 }, { "epoch": 3.1955177743431222, "grad_norm": 1.1652849912643433, "learning_rate": 4.9458447015224776e-05, "loss": 0.7871, "num_input_tokens_seen": 1388480, "step": 4135 }, { "epoch": 3.1993817619783615, "grad_norm": 1.4400334358215332, "learning_rate": 4.945495122546917e-05, "loss": 0.7292, "num_input_tokens_seen": 1390400, "step": 4140 }, { "epoch": 3.2032457496136013, "grad_norm": 0.598832368850708, "learning_rate": 4.9451444313455295e-05, "loss": 0.6614, "num_input_tokens_seen": 1392032, "step": 4145 }, { "epoch": 3.2071097372488406, "grad_norm": 0.8857510089874268, "learning_rate": 4.944792628077811e-05, "loss": 0.6315, "num_input_tokens_seen": 1393792, "step": 4150 }, { "epoch": 3.2109737248840804, "grad_norm": 0.9653303623199463, "learning_rate": 4.9444397129037645e-05, "loss": 0.5386, "num_input_tokens_seen": 1395168, "step": 4155 }, { "epoch": 3.21483771251932, "grad_norm": 0.765886664390564, "learning_rate": 4.944085685983898e-05, "loss": 0.627, "num_input_tokens_seen": 1396544, "step": 4160 }, { "epoch": 3.2187017001545595, "grad_norm": 0.5871866941452026, "learning_rate": 4.9437305474792225e-05, "loss": 0.6129, "num_input_tokens_seen": 1398368, "step": 4165 }, { "epoch": 3.2225656877897992, "grad_norm": 0.573396623134613, "learning_rate": 4.943374297551261e-05, "loss": 0.4052, "num_input_tokens_seen": 1399808, "step": 4170 }, { "epoch": 3.2264296754250386, "grad_norm": 1.0835139751434326, "learning_rate": 4.943016936362035e-05, "loss": 0.4784, "num_input_tokens_seen": 1401440, "step": 4175 }, { "epoch": 3.2302936630602783, "grad_norm": 0.8448111414909363, "learning_rate": 4.942658464074076e-05, "loss": 0.3799, "num_input_tokens_seen": 1402976, "step": 4180 }, { "epoch": 3.2341576506955176, "grad_norm": 1.0080831050872803, "learning_rate": 4.942298880850419e-05, "loss": 0.7704, "num_input_tokens_seen": 1404512, "step": 4185 }, { "epoch": 3.2380216383307574, "grad_norm": 0.8291221857070923, "learning_rate": 4.941938186854605e-05, "loss": 0.5222, "num_input_tokens_seen": 1406144, "step": 4190 }, { "epoch": 3.2418856259659967, "grad_norm": 0.6279733777046204, "learning_rate": 4.941576382250679e-05, "loss": 0.4568, "num_input_tokens_seen": 1407840, "step": 4195 }, { "epoch": 3.2457496136012365, "grad_norm": 0.6190840005874634, "learning_rate": 4.941213467203193e-05, "loss": 0.6406, "num_input_tokens_seen": 1409792, "step": 4200 }, { "epoch": 3.2496136012364762, "grad_norm": 1.178888201713562, "learning_rate": 4.940849441877201e-05, "loss": 0.5082, "num_input_tokens_seen": 1411552, "step": 4205 }, { "epoch": 3.2534775888717156, "grad_norm": 1.274207592010498, "learning_rate": 4.940484306438266e-05, "loss": 0.7698, "num_input_tokens_seen": 1413120, "step": 4210 }, { "epoch": 3.2573415765069553, "grad_norm": 0.7018077373504639, "learning_rate": 4.940118061052453e-05, "loss": 0.392, "num_input_tokens_seen": 1414592, "step": 4215 }, { "epoch": 3.2612055641421946, "grad_norm": 1.1332892179489136, "learning_rate": 4.939750705886332e-05, "loss": 0.4307, "num_input_tokens_seen": 1416416, "step": 4220 }, { "epoch": 3.2650695517774344, "grad_norm": 0.7639138698577881, "learning_rate": 4.9393822411069794e-05, "loss": 0.6609, "num_input_tokens_seen": 1417888, "step": 4225 }, { "epoch": 3.2689335394126737, "grad_norm": 1.0726195573806763, "learning_rate": 4.939012666881975e-05, "loss": 0.5792, "num_input_tokens_seen": 1419552, "step": 4230 }, { "epoch": 3.2727975270479135, "grad_norm": 0.9556975960731506, "learning_rate": 4.938641983379402e-05, "loss": 0.4161, "num_input_tokens_seen": 1421152, "step": 4235 }, { "epoch": 3.276661514683153, "grad_norm": 0.8239498138427734, "learning_rate": 4.9382701907678514e-05, "loss": 0.4735, "num_input_tokens_seen": 1423008, "step": 4240 }, { "epoch": 3.2805255023183926, "grad_norm": 1.0081562995910645, "learning_rate": 4.9378972892164156e-05, "loss": 0.4651, "num_input_tokens_seen": 1424512, "step": 4245 }, { "epoch": 3.2843894899536323, "grad_norm": 0.6779324412345886, "learning_rate": 4.9375232788946926e-05, "loss": 0.4196, "num_input_tokens_seen": 1426336, "step": 4250 }, { "epoch": 3.2882534775888717, "grad_norm": 0.7797317504882812, "learning_rate": 4.937148159972784e-05, "loss": 0.45, "num_input_tokens_seen": 1428352, "step": 4255 }, { "epoch": 3.2921174652241114, "grad_norm": 0.6227105855941772, "learning_rate": 4.936771932621297e-05, "loss": 0.5453, "num_input_tokens_seen": 1430048, "step": 4260 }, { "epoch": 3.2959814528593507, "grad_norm": 0.8546170592308044, "learning_rate": 4.936394597011342e-05, "loss": 0.577, "num_input_tokens_seen": 1431744, "step": 4265 }, { "epoch": 3.2998454404945905, "grad_norm": 0.9970516562461853, "learning_rate": 4.936016153314534e-05, "loss": 0.6198, "num_input_tokens_seen": 1433248, "step": 4270 }, { "epoch": 3.30370942812983, "grad_norm": 0.6481711268424988, "learning_rate": 4.9356366017029897e-05, "loss": 0.448, "num_input_tokens_seen": 1434912, "step": 4275 }, { "epoch": 3.3075734157650696, "grad_norm": 0.8824865818023682, "learning_rate": 4.9352559423493326e-05, "loss": 0.5552, "num_input_tokens_seen": 1436576, "step": 4280 }, { "epoch": 3.3114374034003093, "grad_norm": 0.7782494425773621, "learning_rate": 4.934874175426689e-05, "loss": 0.4449, "num_input_tokens_seen": 1438208, "step": 4285 }, { "epoch": 3.3153013910355487, "grad_norm": 0.951985239982605, "learning_rate": 4.9344913011086894e-05, "loss": 0.6209, "num_input_tokens_seen": 1439904, "step": 4290 }, { "epoch": 3.3191653786707884, "grad_norm": 0.7611915469169617, "learning_rate": 4.934107319569465e-05, "loss": 0.5732, "num_input_tokens_seen": 1441600, "step": 4295 }, { "epoch": 3.3230293663060277, "grad_norm": 0.9421367645263672, "learning_rate": 4.9337222309836554e-05, "loss": 0.6078, "num_input_tokens_seen": 1443296, "step": 4300 }, { "epoch": 3.3268933539412675, "grad_norm": 0.6620069146156311, "learning_rate": 4.933336035526399e-05, "loss": 0.6678, "num_input_tokens_seen": 1444800, "step": 4305 }, { "epoch": 3.330757341576507, "grad_norm": 0.8706424832344055, "learning_rate": 4.932948733373342e-05, "loss": 0.468, "num_input_tokens_seen": 1446368, "step": 4310 }, { "epoch": 3.3346213292117466, "grad_norm": 0.7124268412590027, "learning_rate": 4.93256032470063e-05, "loss": 0.443, "num_input_tokens_seen": 1447776, "step": 4315 }, { "epoch": 3.338485316846986, "grad_norm": 0.64589524269104, "learning_rate": 4.932170809684915e-05, "loss": 0.5139, "num_input_tokens_seen": 1449408, "step": 4320 }, { "epoch": 3.3423493044822257, "grad_norm": 0.6333146095275879, "learning_rate": 4.93178018850335e-05, "loss": 0.3798, "num_input_tokens_seen": 1451424, "step": 4325 }, { "epoch": 3.346213292117465, "grad_norm": 0.6601765751838684, "learning_rate": 4.931388461333591e-05, "loss": 0.6153, "num_input_tokens_seen": 1452896, "step": 4330 }, { "epoch": 3.3500772797527048, "grad_norm": 0.7004759311676025, "learning_rate": 4.9309956283538e-05, "loss": 0.5392, "num_input_tokens_seen": 1454592, "step": 4335 }, { "epoch": 3.3539412673879445, "grad_norm": 0.7571190595626831, "learning_rate": 4.9306016897426375e-05, "loss": 0.4855, "num_input_tokens_seen": 1456320, "step": 4340 }, { "epoch": 3.357805255023184, "grad_norm": 0.5437739491462708, "learning_rate": 4.930206645679271e-05, "loss": 0.5776, "num_input_tokens_seen": 1457696, "step": 4345 }, { "epoch": 3.3616692426584236, "grad_norm": 0.7821775078773499, "learning_rate": 4.929810496343368e-05, "loss": 0.438, "num_input_tokens_seen": 1459168, "step": 4350 }, { "epoch": 3.365533230293663, "grad_norm": 0.7689463496208191, "learning_rate": 4.9294132419150995e-05, "loss": 0.7763, "num_input_tokens_seen": 1460960, "step": 4355 }, { "epoch": 3.3693972179289027, "grad_norm": 1.0673933029174805, "learning_rate": 4.929014882575139e-05, "loss": 0.6632, "num_input_tokens_seen": 1462880, "step": 4360 }, { "epoch": 3.373261205564142, "grad_norm": 0.9013523459434509, "learning_rate": 4.928615418504664e-05, "loss": 0.5263, "num_input_tokens_seen": 1464352, "step": 4365 }, { "epoch": 3.3771251931993818, "grad_norm": 0.8133090138435364, "learning_rate": 4.9282148498853513e-05, "loss": 0.5695, "num_input_tokens_seen": 1465952, "step": 4370 }, { "epoch": 3.3809891808346215, "grad_norm": 0.8100396394729614, "learning_rate": 4.927813176899383e-05, "loss": 0.4483, "num_input_tokens_seen": 1467616, "step": 4375 }, { "epoch": 3.384853168469861, "grad_norm": 0.666679322719574, "learning_rate": 4.927410399729443e-05, "loss": 0.6444, "num_input_tokens_seen": 1469344, "step": 4380 }, { "epoch": 3.3887171561051006, "grad_norm": 0.39717450737953186, "learning_rate": 4.9270065185587154e-05, "loss": 0.3962, "num_input_tokens_seen": 1471200, "step": 4385 }, { "epoch": 3.39258114374034, "grad_norm": 0.6371679306030273, "learning_rate": 4.9266015335708884e-05, "loss": 0.5858, "num_input_tokens_seen": 1472896, "step": 4390 }, { "epoch": 3.3964451313755797, "grad_norm": 0.9276110529899597, "learning_rate": 4.9261954449501525e-05, "loss": 0.5187, "num_input_tokens_seen": 1474688, "step": 4395 }, { "epoch": 3.400309119010819, "grad_norm": 0.48789599537849426, "learning_rate": 4.925788252881197e-05, "loss": 0.4239, "num_input_tokens_seen": 1476576, "step": 4400 }, { "epoch": 3.4041731066460588, "grad_norm": 0.549759030342102, "learning_rate": 4.925379957549217e-05, "loss": 0.4592, "num_input_tokens_seen": 1478304, "step": 4405 }, { "epoch": 3.4080370942812985, "grad_norm": 0.5163140892982483, "learning_rate": 4.924970559139908e-05, "loss": 0.6624, "num_input_tokens_seen": 1479840, "step": 4410 }, { "epoch": 3.411901081916538, "grad_norm": 1.6739932298660278, "learning_rate": 4.9245600578394654e-05, "loss": 0.5817, "num_input_tokens_seen": 1481216, "step": 4415 }, { "epoch": 3.4157650695517776, "grad_norm": 0.71927809715271, "learning_rate": 4.9241484538345887e-05, "loss": 0.6396, "num_input_tokens_seen": 1483424, "step": 4420 }, { "epoch": 3.419629057187017, "grad_norm": 0.5435962080955505, "learning_rate": 4.923735747312477e-05, "loss": 0.4864, "num_input_tokens_seen": 1485376, "step": 4425 }, { "epoch": 3.4234930448222567, "grad_norm": 0.48695939779281616, "learning_rate": 4.923321938460833e-05, "loss": 0.506, "num_input_tokens_seen": 1487136, "step": 4430 }, { "epoch": 3.427357032457496, "grad_norm": 0.7365334033966064, "learning_rate": 4.922907027467858e-05, "loss": 0.3845, "num_input_tokens_seen": 1488576, "step": 4435 }, { "epoch": 3.4312210200927358, "grad_norm": 0.8919357657432556, "learning_rate": 4.922491014522257e-05, "loss": 0.6218, "num_input_tokens_seen": 1490464, "step": 4440 }, { "epoch": 3.435085007727975, "grad_norm": 0.9008132815361023, "learning_rate": 4.922073899813235e-05, "loss": 0.4755, "num_input_tokens_seen": 1492224, "step": 4445 }, { "epoch": 3.438948995363215, "grad_norm": 0.8480945825576782, "learning_rate": 4.9216556835304975e-05, "loss": 0.4406, "num_input_tokens_seen": 1493728, "step": 4450 }, { "epoch": 3.442812982998454, "grad_norm": 0.40039923787117004, "learning_rate": 4.9212363658642536e-05, "loss": 0.5045, "num_input_tokens_seen": 1495392, "step": 4455 }, { "epoch": 3.446676970633694, "grad_norm": 0.4490639567375183, "learning_rate": 4.920815947005209e-05, "loss": 0.4741, "num_input_tokens_seen": 1497312, "step": 4460 }, { "epoch": 3.4505409582689337, "grad_norm": 0.7604079246520996, "learning_rate": 4.920394427144575e-05, "loss": 0.5096, "num_input_tokens_seen": 1499296, "step": 4465 }, { "epoch": 3.454404945904173, "grad_norm": 0.8587743043899536, "learning_rate": 4.91997180647406e-05, "loss": 0.5738, "num_input_tokens_seen": 1500832, "step": 4470 }, { "epoch": 3.458268933539413, "grad_norm": 0.9217371344566345, "learning_rate": 4.9195480851858743e-05, "loss": 0.5115, "num_input_tokens_seen": 1502816, "step": 4475 }, { "epoch": 3.462132921174652, "grad_norm": 0.6321684122085571, "learning_rate": 4.91912326347273e-05, "loss": 0.4262, "num_input_tokens_seen": 1504544, "step": 4480 }, { "epoch": 3.465996908809892, "grad_norm": 0.8140820860862732, "learning_rate": 4.9186973415278375e-05, "loss": 0.4701, "num_input_tokens_seen": 1506272, "step": 4485 }, { "epoch": 3.469860896445131, "grad_norm": 0.5849083662033081, "learning_rate": 4.918270319544909e-05, "loss": 0.4925, "num_input_tokens_seen": 1508032, "step": 4490 }, { "epoch": 3.473724884080371, "grad_norm": 0.7906475067138672, "learning_rate": 4.917842197718157e-05, "loss": 0.4691, "num_input_tokens_seen": 1509696, "step": 4495 }, { "epoch": 3.4775888717156107, "grad_norm": 0.6740891933441162, "learning_rate": 4.917412976242294e-05, "loss": 0.5558, "num_input_tokens_seen": 1511456, "step": 4500 }, { "epoch": 3.48145285935085, "grad_norm": 1.159609317779541, "learning_rate": 4.916982655312532e-05, "loss": 0.4649, "num_input_tokens_seen": 1513216, "step": 4505 }, { "epoch": 3.48531684698609, "grad_norm": 1.519842267036438, "learning_rate": 4.916551235124582e-05, "loss": 0.5442, "num_input_tokens_seen": 1514816, "step": 4510 }, { "epoch": 3.489180834621329, "grad_norm": 0.6051508188247681, "learning_rate": 4.91611871587466e-05, "loss": 0.4575, "num_input_tokens_seen": 1516480, "step": 4515 }, { "epoch": 3.493044822256569, "grad_norm": 0.6568185687065125, "learning_rate": 4.915685097759476e-05, "loss": 0.7259, "num_input_tokens_seen": 1518432, "step": 4520 }, { "epoch": 3.496908809891808, "grad_norm": 0.7474952936172485, "learning_rate": 4.915250380976242e-05, "loss": 0.6183, "num_input_tokens_seen": 1520192, "step": 4525 }, { "epoch": 3.500772797527048, "grad_norm": 0.8032436370849609, "learning_rate": 4.914814565722671e-05, "loss": 0.5219, "num_input_tokens_seen": 1521824, "step": 4530 }, { "epoch": 3.5046367851622877, "grad_norm": 0.7352713346481323, "learning_rate": 4.914377652196973e-05, "loss": 0.5133, "num_input_tokens_seen": 1523488, "step": 4535 }, { "epoch": 3.508500772797527, "grad_norm": 0.6375516653060913, "learning_rate": 4.9139396405978604e-05, "loss": 0.3952, "num_input_tokens_seen": 1525120, "step": 4540 }, { "epoch": 3.5123647604327664, "grad_norm": 0.9701124429702759, "learning_rate": 4.913500531124543e-05, "loss": 0.4511, "num_input_tokens_seen": 1527040, "step": 4545 }, { "epoch": 3.516228748068006, "grad_norm": 0.798828661441803, "learning_rate": 4.9130603239767294e-05, "loss": 0.4197, "num_input_tokens_seen": 1528576, "step": 4550 }, { "epoch": 3.520092735703246, "grad_norm": 0.9932349324226379, "learning_rate": 4.912619019354629e-05, "loss": 0.6366, "num_input_tokens_seen": 1530528, "step": 4555 }, { "epoch": 3.523956723338485, "grad_norm": 1.1112257242202759, "learning_rate": 4.912176617458951e-05, "loss": 0.4621, "num_input_tokens_seen": 1532064, "step": 4560 }, { "epoch": 3.527820710973725, "grad_norm": 1.027362585067749, "learning_rate": 4.911733118490901e-05, "loss": 0.5504, "num_input_tokens_seen": 1533856, "step": 4565 }, { "epoch": 3.5316846986089647, "grad_norm": 0.604669451713562, "learning_rate": 4.9112885226521846e-05, "loss": 0.5694, "num_input_tokens_seen": 1535552, "step": 4570 }, { "epoch": 3.535548686244204, "grad_norm": 1.1075372695922852, "learning_rate": 4.9108428301450084e-05, "loss": 0.5434, "num_input_tokens_seen": 1537216, "step": 4575 }, { "epoch": 3.5394126738794434, "grad_norm": 0.9460054636001587, "learning_rate": 4.9103960411720754e-05, "loss": 0.5548, "num_input_tokens_seen": 1538976, "step": 4580 }, { "epoch": 3.543276661514683, "grad_norm": 1.105589509010315, "learning_rate": 4.909948155936587e-05, "loss": 0.4583, "num_input_tokens_seen": 1540608, "step": 4585 }, { "epoch": 3.547140649149923, "grad_norm": 0.7472132444381714, "learning_rate": 4.9094991746422434e-05, "loss": 0.5196, "num_input_tokens_seen": 1542176, "step": 4590 }, { "epoch": 3.551004636785162, "grad_norm": 1.3863698244094849, "learning_rate": 4.909049097493247e-05, "loss": 0.7, "num_input_tokens_seen": 1544000, "step": 4595 }, { "epoch": 3.554868624420402, "grad_norm": 0.6708266735076904, "learning_rate": 4.9085979246942935e-05, "loss": 0.428, "num_input_tokens_seen": 1545856, "step": 4600 }, { "epoch": 3.5587326120556413, "grad_norm": 0.5889275670051575, "learning_rate": 4.908145656450579e-05, "loss": 0.4008, "num_input_tokens_seen": 1547648, "step": 4605 }, { "epoch": 3.562596599690881, "grad_norm": 0.9158351421356201, "learning_rate": 4.9076922929677984e-05, "loss": 0.6275, "num_input_tokens_seen": 1549440, "step": 4610 }, { "epoch": 3.5664605873261204, "grad_norm": 1.1099928617477417, "learning_rate": 4.907237834452144e-05, "loss": 0.7519, "num_input_tokens_seen": 1551264, "step": 4615 }, { "epoch": 3.57032457496136, "grad_norm": 0.7160236835479736, "learning_rate": 4.9067822811103055e-05, "loss": 0.6154, "num_input_tokens_seen": 1552928, "step": 4620 }, { "epoch": 3.5741885625966, "grad_norm": 0.9749694466590881, "learning_rate": 4.906325633149472e-05, "loss": 0.3945, "num_input_tokens_seen": 1554720, "step": 4625 }, { "epoch": 3.578052550231839, "grad_norm": 0.6481962203979492, "learning_rate": 4.9058678907773305e-05, "loss": 0.5079, "num_input_tokens_seen": 1556480, "step": 4630 }, { "epoch": 3.581916537867079, "grad_norm": 0.6042677164077759, "learning_rate": 4.905409054202063e-05, "loss": 0.5004, "num_input_tokens_seen": 1558208, "step": 4635 }, { "epoch": 3.5857805255023183, "grad_norm": 0.6644335389137268, "learning_rate": 4.904949123632353e-05, "loss": 0.4447, "num_input_tokens_seen": 1559776, "step": 4640 }, { "epoch": 3.589644513137558, "grad_norm": 0.6791064143180847, "learning_rate": 4.9044880992773776e-05, "loss": 0.4418, "num_input_tokens_seen": 1561184, "step": 4645 }, { "epoch": 3.5935085007727974, "grad_norm": 0.996258020401001, "learning_rate": 4.904025981346816e-05, "loss": 0.6372, "num_input_tokens_seen": 1562880, "step": 4650 }, { "epoch": 3.597372488408037, "grad_norm": 0.5431922674179077, "learning_rate": 4.903562770050841e-05, "loss": 0.6035, "num_input_tokens_seen": 1564576, "step": 4655 }, { "epoch": 3.601236476043277, "grad_norm": 0.9417484998703003, "learning_rate": 4.9030984656001236e-05, "loss": 0.6007, "num_input_tokens_seen": 1566272, "step": 4660 }, { "epoch": 3.605100463678516, "grad_norm": 1.3046990633010864, "learning_rate": 4.9026330682058316e-05, "loss": 0.6429, "num_input_tokens_seen": 1567968, "step": 4665 }, { "epoch": 3.6089644513137555, "grad_norm": 0.5430051684379578, "learning_rate": 4.902166578079633e-05, "loss": 0.5736, "num_input_tokens_seen": 1569664, "step": 4670 }, { "epoch": 3.6128284389489953, "grad_norm": 1.3074524402618408, "learning_rate": 4.9016989954336875e-05, "loss": 0.5351, "num_input_tokens_seen": 1571296, "step": 4675 }, { "epoch": 3.616692426584235, "grad_norm": 0.880635142326355, "learning_rate": 4.9012303204806556e-05, "loss": 1.1177, "num_input_tokens_seen": 1573024, "step": 4680 }, { "epoch": 3.6205564142194744, "grad_norm": 0.6943720579147339, "learning_rate": 4.900760553433694e-05, "loss": 0.4369, "num_input_tokens_seen": 1574656, "step": 4685 }, { "epoch": 3.624420401854714, "grad_norm": 0.963611364364624, "learning_rate": 4.900289694506455e-05, "loss": 0.7756, "num_input_tokens_seen": 1576192, "step": 4690 }, { "epoch": 3.628284389489954, "grad_norm": 0.5780571103096008, "learning_rate": 4.899817743913088e-05, "loss": 0.4514, "num_input_tokens_seen": 1578048, "step": 4695 }, { "epoch": 3.6321483771251932, "grad_norm": 0.7227694988250732, "learning_rate": 4.8993447018682395e-05, "loss": 0.479, "num_input_tokens_seen": 1579744, "step": 4700 }, { "epoch": 3.6360123647604325, "grad_norm": 0.6897138357162476, "learning_rate": 4.898870568587051e-05, "loss": 0.4087, "num_input_tokens_seen": 1581248, "step": 4705 }, { "epoch": 3.6398763523956723, "grad_norm": 0.6709452867507935, "learning_rate": 4.898395344285162e-05, "loss": 0.47, "num_input_tokens_seen": 1583200, "step": 4710 }, { "epoch": 3.643740340030912, "grad_norm": 0.7167448997497559, "learning_rate": 4.897919029178707e-05, "loss": 0.4885, "num_input_tokens_seen": 1584640, "step": 4715 }, { "epoch": 3.6476043276661514, "grad_norm": 0.7377757430076599, "learning_rate": 4.8974416234843165e-05, "loss": 0.5518, "num_input_tokens_seen": 1586496, "step": 4720 }, { "epoch": 3.651468315301391, "grad_norm": 0.6590046882629395, "learning_rate": 4.896963127419118e-05, "loss": 0.5841, "num_input_tokens_seen": 1588288, "step": 4725 }, { "epoch": 3.6553323029366305, "grad_norm": 0.7449619174003601, "learning_rate": 4.896483541200735e-05, "loss": 0.5772, "num_input_tokens_seen": 1589888, "step": 4730 }, { "epoch": 3.6591962905718702, "grad_norm": 0.9077023267745972, "learning_rate": 4.896002865047285e-05, "loss": 0.5533, "num_input_tokens_seen": 1591552, "step": 4735 }, { "epoch": 3.6630602782071096, "grad_norm": 0.7307506799697876, "learning_rate": 4.8955210991773825e-05, "loss": 0.4955, "num_input_tokens_seen": 1593408, "step": 4740 }, { "epoch": 3.6669242658423493, "grad_norm": 0.7190829515457153, "learning_rate": 4.895038243810138e-05, "loss": 0.5162, "num_input_tokens_seen": 1595072, "step": 4745 }, { "epoch": 3.670788253477589, "grad_norm": 0.6743956208229065, "learning_rate": 4.8945542991651574e-05, "loss": 0.4194, "num_input_tokens_seen": 1596672, "step": 4750 }, { "epoch": 3.6746522411128284, "grad_norm": 0.6587114334106445, "learning_rate": 4.894069265462542e-05, "loss": 0.4897, "num_input_tokens_seen": 1598656, "step": 4755 }, { "epoch": 3.678516228748068, "grad_norm": 0.48507463932037354, "learning_rate": 4.893583142922885e-05, "loss": 0.5253, "num_input_tokens_seen": 1600192, "step": 4760 }, { "epoch": 3.6823802163833075, "grad_norm": 1.8430289030075073, "learning_rate": 4.893095931767281e-05, "loss": 0.5517, "num_input_tokens_seen": 1602080, "step": 4765 }, { "epoch": 3.6862442040185472, "grad_norm": 0.4905112385749817, "learning_rate": 4.8926076322173156e-05, "loss": 0.5808, "num_input_tokens_seen": 1603840, "step": 4770 }, { "epoch": 3.6901081916537866, "grad_norm": 0.6916890144348145, "learning_rate": 4.892118244495071e-05, "loss": 0.7492, "num_input_tokens_seen": 1605696, "step": 4775 }, { "epoch": 3.6939721792890263, "grad_norm": 0.6606625914573669, "learning_rate": 4.891627768823122e-05, "loss": 0.4661, "num_input_tokens_seen": 1607360, "step": 4780 }, { "epoch": 3.697836166924266, "grad_norm": 0.6435779333114624, "learning_rate": 4.8911362054245416e-05, "loss": 0.5005, "num_input_tokens_seen": 1609120, "step": 4785 }, { "epoch": 3.7017001545595054, "grad_norm": 0.9034379124641418, "learning_rate": 4.890643554522894e-05, "loss": 0.6733, "num_input_tokens_seen": 1610848, "step": 4790 }, { "epoch": 3.7055641421947447, "grad_norm": 0.6058748364448547, "learning_rate": 4.890149816342241e-05, "loss": 0.4758, "num_input_tokens_seen": 1612352, "step": 4795 }, { "epoch": 3.7094281298299845, "grad_norm": 0.4736906886100769, "learning_rate": 4.889654991107138e-05, "loss": 0.4899, "num_input_tokens_seen": 1614272, "step": 4800 }, { "epoch": 3.7132921174652243, "grad_norm": 0.6378251314163208, "learning_rate": 4.889159079042634e-05, "loss": 0.5188, "num_input_tokens_seen": 1616064, "step": 4805 }, { "epoch": 3.7171561051004636, "grad_norm": 0.719385027885437, "learning_rate": 4.888662080374272e-05, "loss": 0.7706, "num_input_tokens_seen": 1617696, "step": 4810 }, { "epoch": 3.7210200927357033, "grad_norm": 0.9703797698020935, "learning_rate": 4.8881639953280914e-05, "loss": 0.5382, "num_input_tokens_seen": 1619552, "step": 4815 }, { "epoch": 3.7248840803709427, "grad_norm": 0.7486345171928406, "learning_rate": 4.887664824130623e-05, "loss": 0.4654, "num_input_tokens_seen": 1621056, "step": 4820 }, { "epoch": 3.7287480680061824, "grad_norm": 0.549968957901001, "learning_rate": 4.887164567008894e-05, "loss": 0.5836, "num_input_tokens_seen": 1622624, "step": 4825 }, { "epoch": 3.7326120556414217, "grad_norm": 0.8464016914367676, "learning_rate": 4.886663224190424e-05, "loss": 0.5231, "num_input_tokens_seen": 1624544, "step": 4830 }, { "epoch": 3.7364760432766615, "grad_norm": 0.9920933842658997, "learning_rate": 4.886160795903226e-05, "loss": 1.0181, "num_input_tokens_seen": 1626208, "step": 4835 }, { "epoch": 3.7403400309119013, "grad_norm": 0.7169435620307922, "learning_rate": 4.885657282375808e-05, "loss": 0.6508, "num_input_tokens_seen": 1627776, "step": 4840 }, { "epoch": 3.7442040185471406, "grad_norm": 0.7069045901298523, "learning_rate": 4.8851526838371706e-05, "loss": 0.4471, "num_input_tokens_seen": 1629440, "step": 4845 }, { "epoch": 3.7480680061823803, "grad_norm": 0.8062605261802673, "learning_rate": 4.8846470005168085e-05, "loss": 0.3766, "num_input_tokens_seen": 1631072, "step": 4850 }, { "epoch": 3.7519319938176197, "grad_norm": 1.049372673034668, "learning_rate": 4.8841402326447096e-05, "loss": 0.6921, "num_input_tokens_seen": 1632640, "step": 4855 }, { "epoch": 3.7557959814528594, "grad_norm": 1.4616609811782837, "learning_rate": 4.883632380451355e-05, "loss": 0.5581, "num_input_tokens_seen": 1634336, "step": 4860 }, { "epoch": 3.7596599690880987, "grad_norm": 0.6981266140937805, "learning_rate": 4.8831234441677186e-05, "loss": 0.4161, "num_input_tokens_seen": 1636128, "step": 4865 }, { "epoch": 3.7635239567233385, "grad_norm": 0.5820719003677368, "learning_rate": 4.882613424025267e-05, "loss": 0.4105, "num_input_tokens_seen": 1637888, "step": 4870 }, { "epoch": 3.7673879443585783, "grad_norm": 0.4576059579849243, "learning_rate": 4.8821023202559624e-05, "loss": 0.3875, "num_input_tokens_seen": 1639392, "step": 4875 }, { "epoch": 3.7712519319938176, "grad_norm": 0.8029463887214661, "learning_rate": 4.881590133092256e-05, "loss": 0.4568, "num_input_tokens_seen": 1641056, "step": 4880 }, { "epoch": 3.7751159196290573, "grad_norm": 0.6097744703292847, "learning_rate": 4.881076862767095e-05, "loss": 0.4503, "num_input_tokens_seen": 1642816, "step": 4885 }, { "epoch": 3.7789799072642967, "grad_norm": 0.749586820602417, "learning_rate": 4.880562509513917e-05, "loss": 0.4465, "num_input_tokens_seen": 1644384, "step": 4890 }, { "epoch": 3.7828438948995364, "grad_norm": 0.8426031470298767, "learning_rate": 4.8800470735666525e-05, "loss": 0.4804, "num_input_tokens_seen": 1646240, "step": 4895 }, { "epoch": 3.7867078825347757, "grad_norm": 0.9515178799629211, "learning_rate": 4.879530555159726e-05, "loss": 0.7863, "num_input_tokens_seen": 1648320, "step": 4900 }, { "epoch": 3.7905718701700155, "grad_norm": 0.5333658456802368, "learning_rate": 4.8790129545280514e-05, "loss": 0.3838, "num_input_tokens_seen": 1650048, "step": 4905 }, { "epoch": 3.7944358578052553, "grad_norm": 0.8448347449302673, "learning_rate": 4.8784942719070395e-05, "loss": 0.7681, "num_input_tokens_seen": 1651552, "step": 4910 }, { "epoch": 3.7982998454404946, "grad_norm": 0.7309392690658569, "learning_rate": 4.8779745075325874e-05, "loss": 0.5124, "num_input_tokens_seen": 1653344, "step": 4915 }, { "epoch": 3.802163833075734, "grad_norm": 0.45058542490005493, "learning_rate": 4.8774536616410884e-05, "loss": 0.8066, "num_input_tokens_seen": 1655264, "step": 4920 }, { "epoch": 3.8060278207109737, "grad_norm": 1.146256446838379, "learning_rate": 4.876931734469425e-05, "loss": 0.4712, "num_input_tokens_seen": 1656928, "step": 4925 }, { "epoch": 3.8098918083462134, "grad_norm": 1.4264575242996216, "learning_rate": 4.876408726254975e-05, "loss": 0.6735, "num_input_tokens_seen": 1658496, "step": 4930 }, { "epoch": 3.8137557959814528, "grad_norm": 0.9846211671829224, "learning_rate": 4.875884637235605e-05, "loss": 0.7163, "num_input_tokens_seen": 1660096, "step": 4935 }, { "epoch": 3.8176197836166925, "grad_norm": 0.6459515690803528, "learning_rate": 4.8753594676496725e-05, "loss": 0.4125, "num_input_tokens_seen": 1661696, "step": 4940 }, { "epoch": 3.821483771251932, "grad_norm": 1.2701416015625, "learning_rate": 4.874833217736029e-05, "loss": 0.6387, "num_input_tokens_seen": 1663264, "step": 4945 }, { "epoch": 3.8253477588871716, "grad_norm": 0.9865385293960571, "learning_rate": 4.874305887734016e-05, "loss": 0.4442, "num_input_tokens_seen": 1664768, "step": 4950 }, { "epoch": 3.829211746522411, "grad_norm": 0.7817051410675049, "learning_rate": 4.8737774778834654e-05, "loss": 0.6107, "num_input_tokens_seen": 1666272, "step": 4955 }, { "epoch": 3.8330757341576507, "grad_norm": 0.9309498071670532, "learning_rate": 4.8732479884247025e-05, "loss": 0.5, "num_input_tokens_seen": 1667904, "step": 4960 }, { "epoch": 3.8369397217928904, "grad_norm": 0.7092393040657043, "learning_rate": 4.872717419598541e-05, "loss": 0.4905, "num_input_tokens_seen": 1669344, "step": 4965 }, { "epoch": 3.8408037094281298, "grad_norm": 1.2983274459838867, "learning_rate": 4.872185771646288e-05, "loss": 0.5425, "num_input_tokens_seen": 1671296, "step": 4970 }, { "epoch": 3.8446676970633695, "grad_norm": 0.8134065270423889, "learning_rate": 4.87165304480974e-05, "loss": 0.5071, "num_input_tokens_seen": 1673024, "step": 4975 }, { "epoch": 3.848531684698609, "grad_norm": 0.7968652844429016, "learning_rate": 4.871119239331183e-05, "loss": 0.7112, "num_input_tokens_seen": 1674624, "step": 4980 }, { "epoch": 3.8523956723338486, "grad_norm": 0.8783146142959595, "learning_rate": 4.870584355453396e-05, "loss": 0.5641, "num_input_tokens_seen": 1676352, "step": 4985 }, { "epoch": 3.856259659969088, "grad_norm": 0.8903418779373169, "learning_rate": 4.870048393419647e-05, "loss": 0.7057, "num_input_tokens_seen": 1678048, "step": 4990 }, { "epoch": 3.8601236476043277, "grad_norm": 0.5092277526855469, "learning_rate": 4.869511353473696e-05, "loss": 0.4434, "num_input_tokens_seen": 1679584, "step": 4995 }, { "epoch": 3.8639876352395675, "grad_norm": 0.4562576711177826, "learning_rate": 4.868973235859791e-05, "loss": 0.4649, "num_input_tokens_seen": 1681312, "step": 5000 }, { "epoch": 3.8678516228748068, "grad_norm": 0.5812659859657288, "learning_rate": 4.8684340408226696e-05, "loss": 0.4321, "num_input_tokens_seen": 1682912, "step": 5005 }, { "epoch": 3.871715610510046, "grad_norm": 0.5146694183349609, "learning_rate": 4.867893768607564e-05, "loss": 0.5735, "num_input_tokens_seen": 1684640, "step": 5010 }, { "epoch": 3.875579598145286, "grad_norm": 0.5959664583206177, "learning_rate": 4.86735241946019e-05, "loss": 0.4978, "num_input_tokens_seen": 1686336, "step": 5015 }, { "epoch": 3.8794435857805256, "grad_norm": 1.6409862041473389, "learning_rate": 4.86680999362676e-05, "loss": 0.733, "num_input_tokens_seen": 1688128, "step": 5020 }, { "epoch": 3.883307573415765, "grad_norm": 0.7823237180709839, "learning_rate": 4.86626649135397e-05, "loss": 0.4391, "num_input_tokens_seen": 1689856, "step": 5025 }, { "epoch": 3.8871715610510047, "grad_norm": 0.6820517778396606, "learning_rate": 4.865721912889009e-05, "loss": 0.8157, "num_input_tokens_seen": 1691584, "step": 5030 }, { "epoch": 3.8910355486862445, "grad_norm": 0.6110095381736755, "learning_rate": 4.8651762584795535e-05, "loss": 0.4393, "num_input_tokens_seen": 1693056, "step": 5035 }, { "epoch": 3.894899536321484, "grad_norm": 0.6221805810928345, "learning_rate": 4.864629528373771e-05, "loss": 0.4858, "num_input_tokens_seen": 1694496, "step": 5040 }, { "epoch": 3.898763523956723, "grad_norm": 1.1552883386611938, "learning_rate": 4.864081722820318e-05, "loss": 0.6296, "num_input_tokens_seen": 1696256, "step": 5045 }, { "epoch": 3.902627511591963, "grad_norm": 0.5828110575675964, "learning_rate": 4.86353284206834e-05, "loss": 0.4761, "num_input_tokens_seen": 1698272, "step": 5050 }, { "epoch": 3.9064914992272026, "grad_norm": 1.6864818334579468, "learning_rate": 4.862982886367471e-05, "loss": 0.563, "num_input_tokens_seen": 1700064, "step": 5055 }, { "epoch": 3.910355486862442, "grad_norm": 0.5700066685676575, "learning_rate": 4.862431855967833e-05, "loss": 0.4203, "num_input_tokens_seen": 1701504, "step": 5060 }, { "epoch": 3.9142194744976817, "grad_norm": 0.6252669095993042, "learning_rate": 4.86187975112004e-05, "loss": 0.6447, "num_input_tokens_seen": 1703552, "step": 5065 }, { "epoch": 3.918083462132921, "grad_norm": 0.689044177532196, "learning_rate": 4.8613265720751904e-05, "loss": 0.58, "num_input_tokens_seen": 1705472, "step": 5070 }, { "epoch": 3.921947449768161, "grad_norm": 0.9951134920120239, "learning_rate": 4.860772319084875e-05, "loss": 0.5662, "num_input_tokens_seen": 1707296, "step": 5075 }, { "epoch": 3.9258114374034, "grad_norm": 0.8086551427841187, "learning_rate": 4.8602169924011703e-05, "loss": 0.545, "num_input_tokens_seen": 1709024, "step": 5080 }, { "epoch": 3.92967542503864, "grad_norm": 1.02944815158844, "learning_rate": 4.859660592276643e-05, "loss": 0.8018, "num_input_tokens_seen": 1710752, "step": 5085 }, { "epoch": 3.9335394126738796, "grad_norm": 0.6812861561775208, "learning_rate": 4.859103118964347e-05, "loss": 0.4549, "num_input_tokens_seen": 1712320, "step": 5090 }, { "epoch": 3.937403400309119, "grad_norm": 1.6059383153915405, "learning_rate": 4.858544572717824e-05, "loss": 0.7325, "num_input_tokens_seen": 1714112, "step": 5095 }, { "epoch": 3.9412673879443587, "grad_norm": 0.7533444762229919, "learning_rate": 4.857984953791105e-05, "loss": 0.7163, "num_input_tokens_seen": 1715744, "step": 5100 }, { "epoch": 3.945131375579598, "grad_norm": 0.6390364170074463, "learning_rate": 4.8574242624387066e-05, "loss": 0.4851, "num_input_tokens_seen": 1717280, "step": 5105 }, { "epoch": 3.948995363214838, "grad_norm": 0.8336163759231567, "learning_rate": 4.856862498915637e-05, "loss": 0.6463, "num_input_tokens_seen": 1718944, "step": 5110 }, { "epoch": 3.952859350850077, "grad_norm": 1.2860265970230103, "learning_rate": 4.8562996634773875e-05, "loss": 0.5838, "num_input_tokens_seen": 1720736, "step": 5115 }, { "epoch": 3.956723338485317, "grad_norm": 0.7122815847396851, "learning_rate": 4.85573575637994e-05, "loss": 0.3917, "num_input_tokens_seen": 1722144, "step": 5120 }, { "epoch": 3.9605873261205566, "grad_norm": 0.6224213242530823, "learning_rate": 4.855170777879762e-05, "loss": 0.547, "num_input_tokens_seen": 1723936, "step": 5125 }, { "epoch": 3.964451313755796, "grad_norm": 1.6355857849121094, "learning_rate": 4.8546047282338105e-05, "loss": 0.4792, "num_input_tokens_seen": 1725408, "step": 5130 }, { "epoch": 3.9683153013910353, "grad_norm": 0.8131416440010071, "learning_rate": 4.854037607699526e-05, "loss": 0.4443, "num_input_tokens_seen": 1726848, "step": 5135 }, { "epoch": 3.972179289026275, "grad_norm": 0.7005437016487122, "learning_rate": 4.853469416534841e-05, "loss": 0.4449, "num_input_tokens_seen": 1728256, "step": 5140 }, { "epoch": 3.976043276661515, "grad_norm": 0.5942810773849487, "learning_rate": 4.85290015499817e-05, "loss": 0.4034, "num_input_tokens_seen": 1730080, "step": 5145 }, { "epoch": 3.979907264296754, "grad_norm": 0.4472332000732422, "learning_rate": 4.852329823348419e-05, "loss": 0.4474, "num_input_tokens_seen": 1732096, "step": 5150 }, { "epoch": 3.983771251931994, "grad_norm": 0.672971785068512, "learning_rate": 4.851758421844976e-05, "loss": 1.0605, "num_input_tokens_seen": 1733920, "step": 5155 }, { "epoch": 3.9876352395672336, "grad_norm": 0.42949873208999634, "learning_rate": 4.8511859507477185e-05, "loss": 0.7982, "num_input_tokens_seen": 1735712, "step": 5160 }, { "epoch": 3.991499227202473, "grad_norm": 0.5437090396881104, "learning_rate": 4.8506124103170096e-05, "loss": 0.4076, "num_input_tokens_seen": 1737120, "step": 5165 }, { "epoch": 3.9953632148377123, "grad_norm": 0.7216773629188538, "learning_rate": 4.850037800813699e-05, "loss": 0.4754, "num_input_tokens_seen": 1738752, "step": 5170 }, { "epoch": 3.999227202472952, "grad_norm": 0.8288756012916565, "learning_rate": 4.849462122499124e-05, "loss": 0.584, "num_input_tokens_seen": 1740288, "step": 5175 }, { "epoch": 4.0, "eval_loss": 0.5370374321937561, "eval_runtime": 6.3589, "eval_samples_per_second": 90.424, "eval_steps_per_second": 22.645, "num_input_tokens_seen": 1740416, "step": 5176 }, { "epoch": 4.003091190108192, "grad_norm": 0.8334658741950989, "learning_rate": 4.848885375635105e-05, "loss": 0.5533, "num_input_tokens_seen": 1741696, "step": 5180 }, { "epoch": 4.006955177743431, "grad_norm": 0.5427860021591187, "learning_rate": 4.84830756048395e-05, "loss": 0.5486, "num_input_tokens_seen": 1743680, "step": 5185 }, { "epoch": 4.0108191653786704, "grad_norm": 0.9039366841316223, "learning_rate": 4.847728677308453e-05, "loss": 0.6552, "num_input_tokens_seen": 1745056, "step": 5190 }, { "epoch": 4.014683153013911, "grad_norm": 0.6624770760536194, "learning_rate": 4.847148726371893e-05, "loss": 0.4262, "num_input_tokens_seen": 1746624, "step": 5195 }, { "epoch": 4.01854714064915, "grad_norm": 0.7394809722900391, "learning_rate": 4.846567707938036e-05, "loss": 0.476, "num_input_tokens_seen": 1748384, "step": 5200 }, { "epoch": 4.022411128284389, "grad_norm": 0.5269570350646973, "learning_rate": 4.845985622271133e-05, "loss": 0.4686, "num_input_tokens_seen": 1749824, "step": 5205 }, { "epoch": 4.0262751159196295, "grad_norm": 0.5425906181335449, "learning_rate": 4.845402469635919e-05, "loss": 0.5257, "num_input_tokens_seen": 1751648, "step": 5210 }, { "epoch": 4.030139103554869, "grad_norm": 0.7693442702293396, "learning_rate": 4.844818250297616e-05, "loss": 0.7544, "num_input_tokens_seen": 1753344, "step": 5215 }, { "epoch": 4.034003091190108, "grad_norm": 0.9598110318183899, "learning_rate": 4.84423296452193e-05, "loss": 0.7841, "num_input_tokens_seen": 1755072, "step": 5220 }, { "epoch": 4.0378670788253475, "grad_norm": 0.947083592414856, "learning_rate": 4.843646612575052e-05, "loss": 0.4761, "num_input_tokens_seen": 1756608, "step": 5225 }, { "epoch": 4.041731066460588, "grad_norm": 0.9186453819274902, "learning_rate": 4.8430591947236605e-05, "loss": 0.406, "num_input_tokens_seen": 1758048, "step": 5230 }, { "epoch": 4.045595054095827, "grad_norm": 0.9180390238761902, "learning_rate": 4.842470711234914e-05, "loss": 0.4628, "num_input_tokens_seen": 1759520, "step": 5235 }, { "epoch": 4.049459041731066, "grad_norm": 0.5713527798652649, "learning_rate": 4.84188116237646e-05, "loss": 0.6737, "num_input_tokens_seen": 1761312, "step": 5240 }, { "epoch": 4.053323029366306, "grad_norm": 0.9484896063804626, "learning_rate": 4.841290548416428e-05, "loss": 0.533, "num_input_tokens_seen": 1763104, "step": 5245 }, { "epoch": 4.057187017001546, "grad_norm": 0.7043066620826721, "learning_rate": 4.8406988696234336e-05, "loss": 0.5093, "num_input_tokens_seen": 1765088, "step": 5250 }, { "epoch": 4.061051004636785, "grad_norm": 0.5255547761917114, "learning_rate": 4.840106126266575e-05, "loss": 0.4141, "num_input_tokens_seen": 1767040, "step": 5255 }, { "epoch": 4.0649149922720245, "grad_norm": 1.1005775928497314, "learning_rate": 4.8395123186154365e-05, "loss": 0.6566, "num_input_tokens_seen": 1768864, "step": 5260 }, { "epoch": 4.068778979907265, "grad_norm": 0.8517866730690002, "learning_rate": 4.838917446940084e-05, "loss": 0.4399, "num_input_tokens_seen": 1770624, "step": 5265 }, { "epoch": 4.072642967542504, "grad_norm": 0.6226833462715149, "learning_rate": 4.83832151151107e-05, "loss": 0.5862, "num_input_tokens_seen": 1772032, "step": 5270 }, { "epoch": 4.076506955177743, "grad_norm": 0.8774064779281616, "learning_rate": 4.8377245125994285e-05, "loss": 0.5946, "num_input_tokens_seen": 1773600, "step": 5275 }, { "epoch": 4.080370942812983, "grad_norm": 1.070698857307434, "learning_rate": 4.837126450476679e-05, "loss": 0.6984, "num_input_tokens_seen": 1775040, "step": 5280 }, { "epoch": 4.084234930448223, "grad_norm": 1.0552465915679932, "learning_rate": 4.8365273254148226e-05, "loss": 0.6504, "num_input_tokens_seen": 1776800, "step": 5285 }, { "epoch": 4.088098918083462, "grad_norm": 0.7210323810577393, "learning_rate": 4.8359271376863454e-05, "loss": 0.4201, "num_input_tokens_seen": 1778496, "step": 5290 }, { "epoch": 4.0919629057187015, "grad_norm": 0.8645302057266235, "learning_rate": 4.8353258875642185e-05, "loss": 0.4705, "num_input_tokens_seen": 1780064, "step": 5295 }, { "epoch": 4.095826893353942, "grad_norm": 0.60664963722229, "learning_rate": 4.8347235753218904e-05, "loss": 0.4491, "num_input_tokens_seen": 1781984, "step": 5300 }, { "epoch": 4.099690880989181, "grad_norm": 0.8257946372032166, "learning_rate": 4.8341202012332985e-05, "loss": 0.5652, "num_input_tokens_seen": 1783904, "step": 5305 }, { "epoch": 4.10355486862442, "grad_norm": 0.8756449818611145, "learning_rate": 4.833515765572861e-05, "loss": 0.4772, "num_input_tokens_seen": 1785600, "step": 5310 }, { "epoch": 4.10741885625966, "grad_norm": 0.7186408638954163, "learning_rate": 4.8329102686154784e-05, "loss": 0.6592, "num_input_tokens_seen": 1787072, "step": 5315 }, { "epoch": 4.1112828438949, "grad_norm": 1.1080188751220703, "learning_rate": 4.832303710636535e-05, "loss": 0.4396, "num_input_tokens_seen": 1788736, "step": 5320 }, { "epoch": 4.115146831530139, "grad_norm": 0.5735934972763062, "learning_rate": 4.831696091911895e-05, "loss": 0.3927, "num_input_tokens_seen": 1790208, "step": 5325 }, { "epoch": 4.1190108191653785, "grad_norm": 0.9489997625350952, "learning_rate": 4.83108741271791e-05, "loss": 0.6526, "num_input_tokens_seen": 1792256, "step": 5330 }, { "epoch": 4.122874806800619, "grad_norm": 0.8139804601669312, "learning_rate": 4.8304776733314085e-05, "loss": 0.4697, "num_input_tokens_seen": 1793888, "step": 5335 }, { "epoch": 4.126738794435858, "grad_norm": 0.6519821286201477, "learning_rate": 4.8298668740297036e-05, "loss": 0.4455, "num_input_tokens_seen": 1795488, "step": 5340 }, { "epoch": 4.130602782071097, "grad_norm": 0.602120578289032, "learning_rate": 4.829255015090592e-05, "loss": 0.7042, "num_input_tokens_seen": 1797376, "step": 5345 }, { "epoch": 4.134466769706337, "grad_norm": 0.7349780201911926, "learning_rate": 4.828642096792351e-05, "loss": 0.5315, "num_input_tokens_seen": 1799008, "step": 5350 }, { "epoch": 4.138330757341577, "grad_norm": 0.8087452054023743, "learning_rate": 4.828028119413738e-05, "loss": 0.5505, "num_input_tokens_seen": 1800544, "step": 5355 }, { "epoch": 4.142194744976816, "grad_norm": 1.8923954963684082, "learning_rate": 4.827413083233995e-05, "loss": 0.486, "num_input_tokens_seen": 1802016, "step": 5360 }, { "epoch": 4.1460587326120555, "grad_norm": 1.0585122108459473, "learning_rate": 4.8267969885328426e-05, "loss": 0.4591, "num_input_tokens_seen": 1803680, "step": 5365 }, { "epoch": 4.149922720247295, "grad_norm": 0.9581099152565002, "learning_rate": 4.8261798355904854e-05, "loss": 0.7485, "num_input_tokens_seen": 1805568, "step": 5370 }, { "epoch": 4.153786707882535, "grad_norm": 0.8446477651596069, "learning_rate": 4.825561624687608e-05, "loss": 0.4709, "num_input_tokens_seen": 1807296, "step": 5375 }, { "epoch": 4.157650695517774, "grad_norm": 0.5415929555892944, "learning_rate": 4.824942356105376e-05, "loss": 0.6321, "num_input_tokens_seen": 1808896, "step": 5380 }, { "epoch": 4.161514683153014, "grad_norm": 0.49888476729393005, "learning_rate": 4.8243220301254377e-05, "loss": 0.4708, "num_input_tokens_seen": 1810688, "step": 5385 }, { "epoch": 4.165378670788254, "grad_norm": 1.7250618934631348, "learning_rate": 4.8237006470299197e-05, "loss": 0.579, "num_input_tokens_seen": 1812384, "step": 5390 }, { "epoch": 4.169242658423493, "grad_norm": 0.5572034120559692, "learning_rate": 4.823078207101431e-05, "loss": 0.7611, "num_input_tokens_seen": 1814176, "step": 5395 }, { "epoch": 4.1731066460587325, "grad_norm": 0.6535527110099792, "learning_rate": 4.8224547106230624e-05, "loss": 0.4548, "num_input_tokens_seen": 1815776, "step": 5400 }, { "epoch": 4.176970633693972, "grad_norm": 0.6978468894958496, "learning_rate": 4.821830157878382e-05, "loss": 0.4495, "num_input_tokens_seen": 1817664, "step": 5405 }, { "epoch": 4.180834621329212, "grad_norm": 1.2637455463409424, "learning_rate": 4.821204549151441e-05, "loss": 0.6184, "num_input_tokens_seen": 1819168, "step": 5410 }, { "epoch": 4.184698608964451, "grad_norm": 1.064057469367981, "learning_rate": 4.82057788472677e-05, "loss": 0.7679, "num_input_tokens_seen": 1821120, "step": 5415 }, { "epoch": 4.188562596599691, "grad_norm": 1.0898927450180054, "learning_rate": 4.81995016488938e-05, "loss": 0.4309, "num_input_tokens_seen": 1822976, "step": 5420 }, { "epoch": 4.192426584234931, "grad_norm": 0.7687124013900757, "learning_rate": 4.8193213899247616e-05, "loss": 0.6445, "num_input_tokens_seen": 1824576, "step": 5425 }, { "epoch": 4.19629057187017, "grad_norm": 0.8497228622436523, "learning_rate": 4.818691560118884e-05, "loss": 0.3935, "num_input_tokens_seen": 1826176, "step": 5430 }, { "epoch": 4.2001545595054095, "grad_norm": 1.427596926689148, "learning_rate": 4.8180606757582e-05, "loss": 0.7213, "num_input_tokens_seen": 1827776, "step": 5435 }, { "epoch": 4.204018547140649, "grad_norm": 0.7309900522232056, "learning_rate": 4.817428737129638e-05, "loss": 0.404, "num_input_tokens_seen": 1829504, "step": 5440 }, { "epoch": 4.207882534775889, "grad_norm": 0.6353925466537476, "learning_rate": 4.8167957445206066e-05, "loss": 0.5277, "num_input_tokens_seen": 1831136, "step": 5445 }, { "epoch": 4.211746522411128, "grad_norm": 0.6415258646011353, "learning_rate": 4.816161698218997e-05, "loss": 0.4796, "num_input_tokens_seen": 1832864, "step": 5450 }, { "epoch": 4.215610510046368, "grad_norm": 1.391902208328247, "learning_rate": 4.8155265985131756e-05, "loss": 0.4488, "num_input_tokens_seen": 1834528, "step": 5455 }, { "epoch": 4.219474497681608, "grad_norm": 0.9182360172271729, "learning_rate": 4.814890445691991e-05, "loss": 0.4478, "num_input_tokens_seen": 1836288, "step": 5460 }, { "epoch": 4.223338485316847, "grad_norm": 0.7020983695983887, "learning_rate": 4.8142532400447676e-05, "loss": 0.5993, "num_input_tokens_seen": 1838016, "step": 5465 }, { "epoch": 4.2272024729520865, "grad_norm": 1.2527307271957397, "learning_rate": 4.813614981861311e-05, "loss": 0.4527, "num_input_tokens_seen": 1839744, "step": 5470 }, { "epoch": 4.231066460587326, "grad_norm": 0.6775925755500793, "learning_rate": 4.8129756714319053e-05, "loss": 0.4904, "num_input_tokens_seen": 1841472, "step": 5475 }, { "epoch": 4.234930448222566, "grad_norm": 1.2910388708114624, "learning_rate": 4.812335309047312e-05, "loss": 0.7839, "num_input_tokens_seen": 1843360, "step": 5480 }, { "epoch": 4.238794435857805, "grad_norm": 0.5457465052604675, "learning_rate": 4.811693894998773e-05, "loss": 0.3577, "num_input_tokens_seen": 1845152, "step": 5485 }, { "epoch": 4.242658423493045, "grad_norm": 0.6392195224761963, "learning_rate": 4.8110514295780054e-05, "loss": 0.5378, "num_input_tokens_seen": 1847136, "step": 5490 }, { "epoch": 4.246522411128284, "grad_norm": 0.8636670708656311, "learning_rate": 4.810407913077208e-05, "loss": 0.4008, "num_input_tokens_seen": 1848960, "step": 5495 }, { "epoch": 4.250386398763524, "grad_norm": 0.6714338660240173, "learning_rate": 4.809763345789054e-05, "loss": 0.495, "num_input_tokens_seen": 1850592, "step": 5500 }, { "epoch": 4.2542503863987635, "grad_norm": 0.7907747626304626, "learning_rate": 4.809117728006699e-05, "loss": 0.4885, "num_input_tokens_seen": 1852480, "step": 5505 }, { "epoch": 4.258114374034003, "grad_norm": 1.2037914991378784, "learning_rate": 4.8084710600237726e-05, "loss": 0.6478, "num_input_tokens_seen": 1854400, "step": 5510 }, { "epoch": 4.261978361669243, "grad_norm": 1.2974226474761963, "learning_rate": 4.807823342134382e-05, "loss": 0.4946, "num_input_tokens_seen": 1856224, "step": 5515 }, { "epoch": 4.265842349304482, "grad_norm": 0.9265073537826538, "learning_rate": 4.807174574633115e-05, "loss": 0.7594, "num_input_tokens_seen": 1858176, "step": 5520 }, { "epoch": 4.269706336939722, "grad_norm": 0.6413447260856628, "learning_rate": 4.806524757815035e-05, "loss": 0.3979, "num_input_tokens_seen": 1859712, "step": 5525 }, { "epoch": 4.273570324574961, "grad_norm": 0.8704351186752319, "learning_rate": 4.8058738919756816e-05, "loss": 0.4164, "num_input_tokens_seen": 1861440, "step": 5530 }, { "epoch": 4.277434312210201, "grad_norm": 1.0833680629730225, "learning_rate": 4.805221977411072e-05, "loss": 0.5554, "num_input_tokens_seen": 1863104, "step": 5535 }, { "epoch": 4.2812982998454405, "grad_norm": 0.4944467544555664, "learning_rate": 4.804569014417703e-05, "loss": 0.3898, "num_input_tokens_seen": 1864576, "step": 5540 }, { "epoch": 4.28516228748068, "grad_norm": 0.9830129146575928, "learning_rate": 4.8039150032925433e-05, "loss": 0.4422, "num_input_tokens_seen": 1866304, "step": 5545 }, { "epoch": 4.289026275115919, "grad_norm": 0.7741503119468689, "learning_rate": 4.803259944333043e-05, "loss": 0.5282, "num_input_tokens_seen": 1867840, "step": 5550 }, { "epoch": 4.292890262751159, "grad_norm": 0.7996810674667358, "learning_rate": 4.8026038378371265e-05, "loss": 0.4719, "num_input_tokens_seen": 1869536, "step": 5555 }, { "epoch": 4.296754250386399, "grad_norm": 0.6062063574790955, "learning_rate": 4.8019466841031946e-05, "loss": 0.3966, "num_input_tokens_seen": 1871200, "step": 5560 }, { "epoch": 4.300618238021638, "grad_norm": 0.7165650129318237, "learning_rate": 4.8012884834301255e-05, "loss": 0.5248, "num_input_tokens_seen": 1872736, "step": 5565 }, { "epoch": 4.304482225656878, "grad_norm": 0.7055517435073853, "learning_rate": 4.800629236117272e-05, "loss": 0.449, "num_input_tokens_seen": 1874432, "step": 5570 }, { "epoch": 4.3083462132921175, "grad_norm": 1.1421610116958618, "learning_rate": 4.799968942464463e-05, "loss": 0.4807, "num_input_tokens_seen": 1876384, "step": 5575 }, { "epoch": 4.312210200927357, "grad_norm": 0.9509115815162659, "learning_rate": 4.799307602772006e-05, "loss": 0.7851, "num_input_tokens_seen": 1878176, "step": 5580 }, { "epoch": 4.316074188562597, "grad_norm": 1.829638123512268, "learning_rate": 4.7986452173406815e-05, "loss": 0.7202, "num_input_tokens_seen": 1879712, "step": 5585 }, { "epoch": 4.319938176197836, "grad_norm": 0.8460495471954346, "learning_rate": 4.797981786471746e-05, "loss": 0.5716, "num_input_tokens_seen": 1881440, "step": 5590 }, { "epoch": 4.323802163833076, "grad_norm": 0.8187834024429321, "learning_rate": 4.7973173104669314e-05, "loss": 0.7169, "num_input_tokens_seen": 1883296, "step": 5595 }, { "epoch": 4.327666151468315, "grad_norm": 0.737341046333313, "learning_rate": 4.796651789628446e-05, "loss": 0.4587, "num_input_tokens_seen": 1884896, "step": 5600 }, { "epoch": 4.331530139103555, "grad_norm": 0.6273441910743713, "learning_rate": 4.795985224258973e-05, "loss": 0.612, "num_input_tokens_seen": 1886720, "step": 5605 }, { "epoch": 4.3353941267387945, "grad_norm": 0.7366257905960083, "learning_rate": 4.7953176146616695e-05, "loss": 0.4899, "num_input_tokens_seen": 1888544, "step": 5610 }, { "epoch": 4.339258114374034, "grad_norm": 0.40619784593582153, "learning_rate": 4.794648961140169e-05, "loss": 0.765, "num_input_tokens_seen": 1890496, "step": 5615 }, { "epoch": 4.343122102009273, "grad_norm": 0.9377054572105408, "learning_rate": 4.793979263998578e-05, "loss": 0.5783, "num_input_tokens_seen": 1892256, "step": 5620 }, { "epoch": 4.346986089644513, "grad_norm": 0.5404183864593506, "learning_rate": 4.793308523541481e-05, "loss": 0.4963, "num_input_tokens_seen": 1893856, "step": 5625 }, { "epoch": 4.350850077279753, "grad_norm": 0.7270441651344299, "learning_rate": 4.792636740073932e-05, "loss": 0.5556, "num_input_tokens_seen": 1895680, "step": 5630 }, { "epoch": 4.354714064914992, "grad_norm": 1.4549683332443237, "learning_rate": 4.791963913901465e-05, "loss": 0.5046, "num_input_tokens_seen": 1897248, "step": 5635 }, { "epoch": 4.358578052550232, "grad_norm": 0.7071883082389832, "learning_rate": 4.791290045330083e-05, "loss": 0.3672, "num_input_tokens_seen": 1898848, "step": 5640 }, { "epoch": 4.3624420401854715, "grad_norm": 0.7704413533210754, "learning_rate": 4.7906151346662665e-05, "loss": 0.4259, "num_input_tokens_seen": 1900416, "step": 5645 }, { "epoch": 4.366306027820711, "grad_norm": 0.7957844138145447, "learning_rate": 4.7899391822169684e-05, "loss": 0.6405, "num_input_tokens_seen": 1902112, "step": 5650 }, { "epoch": 4.37017001545595, "grad_norm": 0.9130534529685974, "learning_rate": 4.7892621882896173e-05, "loss": 0.56, "num_input_tokens_seen": 1903712, "step": 5655 }, { "epoch": 4.37403400309119, "grad_norm": 0.6739981770515442, "learning_rate": 4.7885841531921126e-05, "loss": 0.5941, "num_input_tokens_seen": 1905248, "step": 5660 }, { "epoch": 4.37789799072643, "grad_norm": 0.7067570686340332, "learning_rate": 4.787905077232829e-05, "loss": 0.4219, "num_input_tokens_seen": 1906784, "step": 5665 }, { "epoch": 4.381761978361669, "grad_norm": 0.6558043956756592, "learning_rate": 4.7872249607206146e-05, "loss": 0.6197, "num_input_tokens_seen": 1908512, "step": 5670 }, { "epoch": 4.385625965996908, "grad_norm": 1.1996897459030151, "learning_rate": 4.7865438039647906e-05, "loss": 0.4556, "num_input_tokens_seen": 1910240, "step": 5675 }, { "epoch": 4.3894899536321486, "grad_norm": 0.7168941497802734, "learning_rate": 4.785861607275152e-05, "loss": 0.5892, "num_input_tokens_seen": 1912160, "step": 5680 }, { "epoch": 4.393353941267388, "grad_norm": 0.6483076214790344, "learning_rate": 4.7851783709619634e-05, "loss": 0.4197, "num_input_tokens_seen": 1913856, "step": 5685 }, { "epoch": 4.397217928902627, "grad_norm": 0.6685015559196472, "learning_rate": 4.784494095335966e-05, "loss": 0.4739, "num_input_tokens_seen": 1915296, "step": 5690 }, { "epoch": 4.401081916537867, "grad_norm": 1.1705269813537598, "learning_rate": 4.783808780708374e-05, "loss": 0.4223, "num_input_tokens_seen": 1917024, "step": 5695 }, { "epoch": 4.404945904173107, "grad_norm": 1.2116577625274658, "learning_rate": 4.783122427390871e-05, "loss": 0.5649, "num_input_tokens_seen": 1918784, "step": 5700 }, { "epoch": 4.408809891808346, "grad_norm": 0.8534665703773499, "learning_rate": 4.782435035695615e-05, "loss": 0.4948, "num_input_tokens_seen": 1920384, "step": 5705 }, { "epoch": 4.412673879443586, "grad_norm": 0.9482265710830688, "learning_rate": 4.781746605935236e-05, "loss": 0.4436, "num_input_tokens_seen": 1921888, "step": 5710 }, { "epoch": 4.416537867078826, "grad_norm": 0.6280705332756042, "learning_rate": 4.781057138422835e-05, "loss": 0.4525, "num_input_tokens_seen": 1923456, "step": 5715 }, { "epoch": 4.420401854714065, "grad_norm": 1.1052274703979492, "learning_rate": 4.780366633471987e-05, "loss": 0.4448, "num_input_tokens_seen": 1925376, "step": 5720 }, { "epoch": 4.424265842349304, "grad_norm": 0.9936651587486267, "learning_rate": 4.7796750913967374e-05, "loss": 0.6323, "num_input_tokens_seen": 1926944, "step": 5725 }, { "epoch": 4.428129829984544, "grad_norm": 0.6009629964828491, "learning_rate": 4.778982512511604e-05, "loss": 0.7283, "num_input_tokens_seen": 1928384, "step": 5730 }, { "epoch": 4.431993817619784, "grad_norm": 1.6747392416000366, "learning_rate": 4.778288897131576e-05, "loss": 0.658, "num_input_tokens_seen": 1930016, "step": 5735 }, { "epoch": 4.435857805255023, "grad_norm": 0.9713713526725769, "learning_rate": 4.777594245572113e-05, "loss": 0.4551, "num_input_tokens_seen": 1931584, "step": 5740 }, { "epoch": 4.439721792890262, "grad_norm": 0.7609559893608093, "learning_rate": 4.7768985581491474e-05, "loss": 0.4648, "num_input_tokens_seen": 1933120, "step": 5745 }, { "epoch": 4.443585780525503, "grad_norm": 1.4496846199035645, "learning_rate": 4.776201835179082e-05, "loss": 0.6943, "num_input_tokens_seen": 1934592, "step": 5750 }, { "epoch": 4.447449768160742, "grad_norm": 0.6344082355499268, "learning_rate": 4.7755040769787895e-05, "loss": 0.4008, "num_input_tokens_seen": 1936320, "step": 5755 }, { "epoch": 4.451313755795981, "grad_norm": 0.957955002784729, "learning_rate": 4.774805283865616e-05, "loss": 0.553, "num_input_tokens_seen": 1938112, "step": 5760 }, { "epoch": 4.455177743431221, "grad_norm": 0.7908819317817688, "learning_rate": 4.774105456157375e-05, "loss": 0.5158, "num_input_tokens_seen": 1939968, "step": 5765 }, { "epoch": 4.459041731066461, "grad_norm": 1.0048965215682983, "learning_rate": 4.773404594172355e-05, "loss": 0.5692, "num_input_tokens_seen": 1941728, "step": 5770 }, { "epoch": 4.4629057187017, "grad_norm": 0.6018010377883911, "learning_rate": 4.77270269822931e-05, "loss": 0.5045, "num_input_tokens_seen": 1943168, "step": 5775 }, { "epoch": 4.466769706336939, "grad_norm": 0.7279105186462402, "learning_rate": 4.771999768647467e-05, "loss": 0.4689, "num_input_tokens_seen": 1944928, "step": 5780 }, { "epoch": 4.47063369397218, "grad_norm": 1.1490614414215088, "learning_rate": 4.771295805746523e-05, "loss": 0.6149, "num_input_tokens_seen": 1946656, "step": 5785 }, { "epoch": 4.474497681607419, "grad_norm": 0.39847081899642944, "learning_rate": 4.770590809846644e-05, "loss": 0.4536, "num_input_tokens_seen": 1948256, "step": 5790 }, { "epoch": 4.478361669242658, "grad_norm": 0.9301323294639587, "learning_rate": 4.7698847812684663e-05, "loss": 0.4075, "num_input_tokens_seen": 1949888, "step": 5795 }, { "epoch": 4.4822256568778975, "grad_norm": 1.7020866870880127, "learning_rate": 4.769177720333097e-05, "loss": 0.564, "num_input_tokens_seen": 1951552, "step": 5800 }, { "epoch": 4.486089644513138, "grad_norm": 0.6930032968521118, "learning_rate": 4.768469627362111e-05, "loss": 0.4719, "num_input_tokens_seen": 1953184, "step": 5805 }, { "epoch": 4.489953632148377, "grad_norm": 0.77243572473526, "learning_rate": 4.767760502677553e-05, "loss": 0.6277, "num_input_tokens_seen": 1954784, "step": 5810 }, { "epoch": 4.493817619783616, "grad_norm": 0.5803499817848206, "learning_rate": 4.767050346601937e-05, "loss": 0.4428, "num_input_tokens_seen": 1956288, "step": 5815 }, { "epoch": 4.497681607418857, "grad_norm": 0.8222053050994873, "learning_rate": 4.7663391594582465e-05, "loss": 0.4007, "num_input_tokens_seen": 1958016, "step": 5820 }, { "epoch": 4.501545595054096, "grad_norm": 0.7688466906547546, "learning_rate": 4.7656269415699344e-05, "loss": 0.4404, "num_input_tokens_seen": 1959648, "step": 5825 }, { "epoch": 4.505409582689335, "grad_norm": 1.1673009395599365, "learning_rate": 4.7649136932609204e-05, "loss": 0.4446, "num_input_tokens_seen": 1961600, "step": 5830 }, { "epoch": 4.509273570324575, "grad_norm": 0.9033278822898865, "learning_rate": 4.7641994148555944e-05, "loss": 0.5656, "num_input_tokens_seen": 1963200, "step": 5835 }, { "epoch": 4.513137557959815, "grad_norm": 0.6729082465171814, "learning_rate": 4.7634841066788154e-05, "loss": 0.4914, "num_input_tokens_seen": 1964704, "step": 5840 }, { "epoch": 4.517001545595054, "grad_norm": 0.5731209516525269, "learning_rate": 4.762767769055909e-05, "loss": 0.4744, "num_input_tokens_seen": 1966496, "step": 5845 }, { "epoch": 4.520865533230293, "grad_norm": 1.3030903339385986, "learning_rate": 4.7620504023126697e-05, "loss": 0.5085, "num_input_tokens_seen": 1968224, "step": 5850 }, { "epoch": 4.524729520865534, "grad_norm": 0.7214072346687317, "learning_rate": 4.761332006775361e-05, "loss": 0.4434, "num_input_tokens_seen": 1969952, "step": 5855 }, { "epoch": 4.528593508500773, "grad_norm": 0.7951311469078064, "learning_rate": 4.7606125827707125e-05, "loss": 0.6074, "num_input_tokens_seen": 1971488, "step": 5860 }, { "epoch": 4.532457496136012, "grad_norm": 0.5387468934059143, "learning_rate": 4.7598921306259236e-05, "loss": 0.3638, "num_input_tokens_seen": 1973120, "step": 5865 }, { "epoch": 4.5363214837712516, "grad_norm": 1.357650876045227, "learning_rate": 4.7591706506686595e-05, "loss": 0.6553, "num_input_tokens_seen": 1974912, "step": 5870 }, { "epoch": 4.540185471406492, "grad_norm": 0.8873515725135803, "learning_rate": 4.7584481432270545e-05, "loss": 0.5301, "num_input_tokens_seen": 1976864, "step": 5875 }, { "epoch": 4.544049459041731, "grad_norm": 0.9269487261772156, "learning_rate": 4.757724608629708e-05, "loss": 0.4477, "num_input_tokens_seen": 1978656, "step": 5880 }, { "epoch": 4.54791344667697, "grad_norm": 0.6512171626091003, "learning_rate": 4.757000047205688e-05, "loss": 0.5615, "num_input_tokens_seen": 1980224, "step": 5885 }, { "epoch": 4.551777434312211, "grad_norm": 1.0856655836105347, "learning_rate": 4.756274459284531e-05, "loss": 0.4444, "num_input_tokens_seen": 1981824, "step": 5890 }, { "epoch": 4.55564142194745, "grad_norm": 0.7748255133628845, "learning_rate": 4.755547845196236e-05, "loss": 0.5089, "num_input_tokens_seen": 1983328, "step": 5895 }, { "epoch": 4.559505409582689, "grad_norm": 0.48968276381492615, "learning_rate": 4.754820205271275e-05, "loss": 0.545, "num_input_tokens_seen": 1984832, "step": 5900 }, { "epoch": 4.563369397217929, "grad_norm": 0.9722805619239807, "learning_rate": 4.75409153984058e-05, "loss": 0.5357, "num_input_tokens_seen": 1986560, "step": 5905 }, { "epoch": 4.567233384853169, "grad_norm": 0.8732215762138367, "learning_rate": 4.753361849235554e-05, "loss": 0.4036, "num_input_tokens_seen": 1988192, "step": 5910 }, { "epoch": 4.571097372488408, "grad_norm": 0.868105411529541, "learning_rate": 4.752631133788064e-05, "loss": 0.3833, "num_input_tokens_seen": 1990080, "step": 5915 }, { "epoch": 4.574961360123647, "grad_norm": 1.1493436098098755, "learning_rate": 4.751899393830443e-05, "loss": 0.5522, "num_input_tokens_seen": 1991648, "step": 5920 }, { "epoch": 4.578825347758887, "grad_norm": 0.4856592118740082, "learning_rate": 4.751166629695492e-05, "loss": 0.6742, "num_input_tokens_seen": 1993216, "step": 5925 }, { "epoch": 4.582689335394127, "grad_norm": 0.6741303205490112, "learning_rate": 4.7504328417164765e-05, "loss": 0.6355, "num_input_tokens_seen": 1994880, "step": 5930 }, { "epoch": 4.586553323029366, "grad_norm": 0.5733489394187927, "learning_rate": 4.749698030227127e-05, "loss": 0.403, "num_input_tokens_seen": 1996448, "step": 5935 }, { "epoch": 4.590417310664606, "grad_norm": 1.4829344749450684, "learning_rate": 4.74896219556164e-05, "loss": 0.6664, "num_input_tokens_seen": 1997984, "step": 5940 }, { "epoch": 4.594281298299846, "grad_norm": 0.5643581748008728, "learning_rate": 4.748225338054679e-05, "loss": 0.3967, "num_input_tokens_seen": 1999424, "step": 5945 }, { "epoch": 4.598145285935085, "grad_norm": 0.7846442461013794, "learning_rate": 4.74748745804137e-05, "loss": 0.5765, "num_input_tokens_seen": 2001056, "step": 5950 }, { "epoch": 4.602009273570324, "grad_norm": 1.6705886125564575, "learning_rate": 4.746748555857304e-05, "loss": 0.6103, "num_input_tokens_seen": 2002976, "step": 5955 }, { "epoch": 4.605873261205565, "grad_norm": 0.6543671488761902, "learning_rate": 4.746008631838541e-05, "loss": 0.3778, "num_input_tokens_seen": 2004704, "step": 5960 }, { "epoch": 4.609737248840804, "grad_norm": 0.7328267097473145, "learning_rate": 4.7452676863216015e-05, "loss": 0.3913, "num_input_tokens_seen": 2006400, "step": 5965 }, { "epoch": 4.613601236476043, "grad_norm": 1.4532653093338013, "learning_rate": 4.744525719643471e-05, "loss": 0.6202, "num_input_tokens_seen": 2008032, "step": 5970 }, { "epoch": 4.617465224111283, "grad_norm": 0.9956274032592773, "learning_rate": 4.743782732141602e-05, "loss": 0.4734, "num_input_tokens_seen": 2009728, "step": 5975 }, { "epoch": 4.621329211746523, "grad_norm": 0.9033043384552002, "learning_rate": 4.7430387241539085e-05, "loss": 0.4965, "num_input_tokens_seen": 2011616, "step": 5980 }, { "epoch": 4.625193199381762, "grad_norm": 0.9144066572189331, "learning_rate": 4.74229369601877e-05, "loss": 0.3969, "num_input_tokens_seen": 2013312, "step": 5985 }, { "epoch": 4.629057187017001, "grad_norm": 0.5892120003700256, "learning_rate": 4.74154764807503e-05, "loss": 0.4464, "num_input_tokens_seen": 2014816, "step": 5990 }, { "epoch": 4.632921174652241, "grad_norm": 0.5341910123825073, "learning_rate": 4.740800580661996e-05, "loss": 0.4012, "num_input_tokens_seen": 2016352, "step": 5995 }, { "epoch": 4.636785162287481, "grad_norm": 1.0032546520233154, "learning_rate": 4.740052494119439e-05, "loss": 0.4218, "num_input_tokens_seen": 2017888, "step": 6000 }, { "epoch": 4.64064914992272, "grad_norm": 0.6963416934013367, "learning_rate": 4.7393033887875916e-05, "loss": 0.6268, "num_input_tokens_seen": 2019584, "step": 6005 }, { "epoch": 4.64451313755796, "grad_norm": 1.578284502029419, "learning_rate": 4.738553265007152e-05, "loss": 0.7608, "num_input_tokens_seen": 2021088, "step": 6010 }, { "epoch": 4.6483771251932, "grad_norm": 0.8527438044548035, "learning_rate": 4.7378021231192815e-05, "loss": 0.4654, "num_input_tokens_seen": 2022816, "step": 6015 }, { "epoch": 4.652241112828439, "grad_norm": 0.8561051487922668, "learning_rate": 4.737049963465604e-05, "loss": 0.4404, "num_input_tokens_seen": 2024384, "step": 6020 }, { "epoch": 4.656105100463678, "grad_norm": 0.5341562032699585, "learning_rate": 4.7362967863882056e-05, "loss": 0.3606, "num_input_tokens_seen": 2025888, "step": 6025 }, { "epoch": 4.659969088098918, "grad_norm": 0.668375551700592, "learning_rate": 4.7355425922296364e-05, "loss": 0.4152, "num_input_tokens_seen": 2027584, "step": 6030 }, { "epoch": 4.663833075734158, "grad_norm": 0.7669928669929504, "learning_rate": 4.734787381332908e-05, "loss": 0.4104, "num_input_tokens_seen": 2029056, "step": 6035 }, { "epoch": 4.667697063369397, "grad_norm": 0.7220039367675781, "learning_rate": 4.734031154041495e-05, "loss": 0.4488, "num_input_tokens_seen": 2030912, "step": 6040 }, { "epoch": 4.671561051004637, "grad_norm": 0.5946566462516785, "learning_rate": 4.733273910699334e-05, "loss": 0.4193, "num_input_tokens_seen": 2032640, "step": 6045 }, { "epoch": 4.675425038639876, "grad_norm": 0.5543398857116699, "learning_rate": 4.732515651650824e-05, "loss": 0.4312, "num_input_tokens_seen": 2034112, "step": 6050 }, { "epoch": 4.679289026275116, "grad_norm": 1.428449273109436, "learning_rate": 4.7317563772408255e-05, "loss": 0.6237, "num_input_tokens_seen": 2036224, "step": 6055 }, { "epoch": 4.683153013910355, "grad_norm": 1.2671746015548706, "learning_rate": 4.730996087814662e-05, "loss": 0.3644, "num_input_tokens_seen": 2037792, "step": 6060 }, { "epoch": 4.687017001545595, "grad_norm": 1.1625111103057861, "learning_rate": 4.730234783718116e-05, "loss": 0.463, "num_input_tokens_seen": 2039520, "step": 6065 }, { "epoch": 4.690880989180835, "grad_norm": 1.1303023099899292, "learning_rate": 4.729472465297434e-05, "loss": 0.4319, "num_input_tokens_seen": 2041344, "step": 6070 }, { "epoch": 4.694744976816074, "grad_norm": 0.7742786407470703, "learning_rate": 4.7287091328993226e-05, "loss": 0.4473, "num_input_tokens_seen": 2043264, "step": 6075 }, { "epoch": 4.698608964451314, "grad_norm": 0.4642827808856964, "learning_rate": 4.727944786870951e-05, "loss": 0.4325, "num_input_tokens_seen": 2044992, "step": 6080 }, { "epoch": 4.702472952086554, "grad_norm": 0.9309966564178467, "learning_rate": 4.7271794275599477e-05, "loss": 0.4593, "num_input_tokens_seen": 2046592, "step": 6085 }, { "epoch": 4.706336939721793, "grad_norm": 1.6891334056854248, "learning_rate": 4.726413055314403e-05, "loss": 0.4611, "num_input_tokens_seen": 2048448, "step": 6090 }, { "epoch": 4.710200927357032, "grad_norm": 1.0250260829925537, "learning_rate": 4.725645670482866e-05, "loss": 0.486, "num_input_tokens_seen": 2049984, "step": 6095 }, { "epoch": 4.714064914992272, "grad_norm": 0.7428543567657471, "learning_rate": 4.72487727341435e-05, "loss": 0.6123, "num_input_tokens_seen": 2051648, "step": 6100 }, { "epoch": 4.717928902627512, "grad_norm": 0.9156472682952881, "learning_rate": 4.724107864458326e-05, "loss": 0.5361, "num_input_tokens_seen": 2053280, "step": 6105 }, { "epoch": 4.721792890262751, "grad_norm": 0.6366098523139954, "learning_rate": 4.723337443964725e-05, "loss": 0.8389, "num_input_tokens_seen": 2055424, "step": 6110 }, { "epoch": 4.725656877897991, "grad_norm": 0.6133863925933838, "learning_rate": 4.7225660122839396e-05, "loss": 0.4212, "num_input_tokens_seen": 2056928, "step": 6115 }, { "epoch": 4.72952086553323, "grad_norm": 1.6381926536560059, "learning_rate": 4.721793569766822e-05, "loss": 0.7141, "num_input_tokens_seen": 2058464, "step": 6120 }, { "epoch": 4.73338485316847, "grad_norm": 1.0245774984359741, "learning_rate": 4.721020116764683e-05, "loss": 0.42, "num_input_tokens_seen": 2059936, "step": 6125 }, { "epoch": 4.7372488408037094, "grad_norm": 1.331956148147583, "learning_rate": 4.720245653629293e-05, "loss": 0.8564, "num_input_tokens_seen": 2061600, "step": 6130 }, { "epoch": 4.741112828438949, "grad_norm": 0.7519360184669495, "learning_rate": 4.719470180712884e-05, "loss": 0.8382, "num_input_tokens_seen": 2063392, "step": 6135 }, { "epoch": 4.744976816074189, "grad_norm": 0.8459218740463257, "learning_rate": 4.718693698368144e-05, "loss": 0.4674, "num_input_tokens_seen": 2064800, "step": 6140 }, { "epoch": 4.748840803709428, "grad_norm": 0.5795201063156128, "learning_rate": 4.717916206948223e-05, "loss": 0.407, "num_input_tokens_seen": 2066528, "step": 6145 }, { "epoch": 4.752704791344668, "grad_norm": 0.7852379679679871, "learning_rate": 4.7171377068067294e-05, "loss": 0.5565, "num_input_tokens_seen": 2068224, "step": 6150 }, { "epoch": 4.756568778979907, "grad_norm": 0.8508298993110657, "learning_rate": 4.716358198297728e-05, "loss": 0.4844, "num_input_tokens_seen": 2069600, "step": 6155 }, { "epoch": 4.760432766615147, "grad_norm": 1.1499451398849487, "learning_rate": 4.715577681775744e-05, "loss": 0.5979, "num_input_tokens_seen": 2071232, "step": 6160 }, { "epoch": 4.7642967542503865, "grad_norm": 0.4408530294895172, "learning_rate": 4.714796157595763e-05, "loss": 0.5862, "num_input_tokens_seen": 2072800, "step": 6165 }, { "epoch": 4.768160741885626, "grad_norm": 0.9128597974777222, "learning_rate": 4.714013626113226e-05, "loss": 0.8024, "num_input_tokens_seen": 2074272, "step": 6170 }, { "epoch": 4.772024729520865, "grad_norm": 1.2592575550079346, "learning_rate": 4.713230087684032e-05, "loss": 0.4867, "num_input_tokens_seen": 2075776, "step": 6175 }, { "epoch": 4.775888717156105, "grad_norm": 1.0003180503845215, "learning_rate": 4.7124455426645396e-05, "loss": 0.5624, "num_input_tokens_seen": 2077408, "step": 6180 }, { "epoch": 4.779752704791345, "grad_norm": 0.779034435749054, "learning_rate": 4.7116599914115645e-05, "loss": 0.4336, "num_input_tokens_seen": 2079232, "step": 6185 }, { "epoch": 4.783616692426584, "grad_norm": 0.5337037444114685, "learning_rate": 4.7108734342823803e-05, "loss": 0.4973, "num_input_tokens_seen": 2080832, "step": 6190 }, { "epoch": 4.787480680061824, "grad_norm": 0.5484507083892822, "learning_rate": 4.7100858716347175e-05, "loss": 0.4329, "num_input_tokens_seen": 2082304, "step": 6195 }, { "epoch": 4.7913446676970635, "grad_norm": 0.6246550679206848, "learning_rate": 4.709297303826765e-05, "loss": 0.4641, "num_input_tokens_seen": 2084160, "step": 6200 }, { "epoch": 4.795208655332303, "grad_norm": 0.5935916900634766, "learning_rate": 4.708507731217168e-05, "loss": 0.4513, "num_input_tokens_seen": 2085440, "step": 6205 }, { "epoch": 4.799072642967543, "grad_norm": 0.5700545907020569, "learning_rate": 4.707717154165028e-05, "loss": 0.5835, "num_input_tokens_seen": 2087264, "step": 6210 }, { "epoch": 4.802936630602782, "grad_norm": 0.5843462347984314, "learning_rate": 4.7069255730299044e-05, "loss": 0.6009, "num_input_tokens_seen": 2089152, "step": 6215 }, { "epoch": 4.806800618238022, "grad_norm": 0.8773878812789917, "learning_rate": 4.706132988171814e-05, "loss": 0.5205, "num_input_tokens_seen": 2090848, "step": 6220 }, { "epoch": 4.810664605873261, "grad_norm": 0.920456051826477, "learning_rate": 4.705339399951229e-05, "loss": 0.5928, "num_input_tokens_seen": 2092896, "step": 6225 }, { "epoch": 4.814528593508501, "grad_norm": 1.1375662088394165, "learning_rate": 4.7045448087290763e-05, "loss": 0.4879, "num_input_tokens_seen": 2094432, "step": 6230 }, { "epoch": 4.8183925811437405, "grad_norm": 0.9281238317489624, "learning_rate": 4.703749214866744e-05, "loss": 0.5424, "num_input_tokens_seen": 2096352, "step": 6235 }, { "epoch": 4.82225656877898, "grad_norm": 1.1794207096099854, "learning_rate": 4.7029526187260694e-05, "loss": 0.4046, "num_input_tokens_seen": 2098272, "step": 6240 }, { "epoch": 4.826120556414219, "grad_norm": 0.77918940782547, "learning_rate": 4.702155020669352e-05, "loss": 0.7275, "num_input_tokens_seen": 2100064, "step": 6245 }, { "epoch": 4.829984544049459, "grad_norm": 0.5262653231620789, "learning_rate": 4.701356421059342e-05, "loss": 0.4409, "num_input_tokens_seen": 2101856, "step": 6250 }, { "epoch": 4.833848531684699, "grad_norm": 0.7374491095542908, "learning_rate": 4.700556820259249e-05, "loss": 0.4215, "num_input_tokens_seen": 2103424, "step": 6255 }, { "epoch": 4.837712519319938, "grad_norm": 0.763353705406189, "learning_rate": 4.6997562186327355e-05, "loss": 0.4333, "num_input_tokens_seen": 2104960, "step": 6260 }, { "epoch": 4.841576506955178, "grad_norm": 0.6314825415611267, "learning_rate": 4.6989546165439196e-05, "loss": 0.4891, "num_input_tokens_seen": 2106592, "step": 6265 }, { "epoch": 4.8454404945904175, "grad_norm": 0.7517908811569214, "learning_rate": 4.698152014357376e-05, "loss": 0.7916, "num_input_tokens_seen": 2108288, "step": 6270 }, { "epoch": 4.849304482225657, "grad_norm": 0.6995319128036499, "learning_rate": 4.697348412438131e-05, "loss": 0.6721, "num_input_tokens_seen": 2110144, "step": 6275 }, { "epoch": 4.853168469860896, "grad_norm": 1.4099335670471191, "learning_rate": 4.6965438111516685e-05, "loss": 0.5777, "num_input_tokens_seen": 2111712, "step": 6280 }, { "epoch": 4.857032457496136, "grad_norm": 0.6614497900009155, "learning_rate": 4.695738210863926e-05, "loss": 0.3685, "num_input_tokens_seen": 2113376, "step": 6285 }, { "epoch": 4.860896445131376, "grad_norm": 0.6675326824188232, "learning_rate": 4.694931611941297e-05, "loss": 0.4349, "num_input_tokens_seen": 2114912, "step": 6290 }, { "epoch": 4.864760432766615, "grad_norm": 1.0346176624298096, "learning_rate": 4.694124014750624e-05, "loss": 0.4991, "num_input_tokens_seen": 2116672, "step": 6295 }, { "epoch": 4.868624420401854, "grad_norm": 1.0195988416671753, "learning_rate": 4.69331541965921e-05, "loss": 0.4388, "num_input_tokens_seen": 2118272, "step": 6300 }, { "epoch": 4.8724884080370945, "grad_norm": 0.8536468744277954, "learning_rate": 4.6925058270348076e-05, "loss": 0.6109, "num_input_tokens_seen": 2119936, "step": 6305 }, { "epoch": 4.876352395672334, "grad_norm": 1.1500799655914307, "learning_rate": 4.691695237245625e-05, "loss": 0.508, "num_input_tokens_seen": 2121504, "step": 6310 }, { "epoch": 4.880216383307573, "grad_norm": 0.5266467332839966, "learning_rate": 4.690883650660323e-05, "loss": 0.433, "num_input_tokens_seen": 2123168, "step": 6315 }, { "epoch": 4.884080370942813, "grad_norm": 2.0006909370422363, "learning_rate": 4.690071067648016e-05, "loss": 0.4614, "num_input_tokens_seen": 2124704, "step": 6320 }, { "epoch": 4.887944358578053, "grad_norm": 1.1234065294265747, "learning_rate": 4.6892574885782714e-05, "loss": 0.6089, "num_input_tokens_seen": 2126368, "step": 6325 }, { "epoch": 4.891808346213292, "grad_norm": 0.753980815410614, "learning_rate": 4.68844291382111e-05, "loss": 0.4557, "num_input_tokens_seen": 2128160, "step": 6330 }, { "epoch": 4.895672333848532, "grad_norm": 0.8270154595375061, "learning_rate": 4.687627343747005e-05, "loss": 0.4082, "num_input_tokens_seen": 2129856, "step": 6335 }, { "epoch": 4.8995363214837715, "grad_norm": 0.6601927280426025, "learning_rate": 4.6868107787268835e-05, "loss": 0.4163, "num_input_tokens_seen": 2131584, "step": 6340 }, { "epoch": 4.903400309119011, "grad_norm": 1.3016000986099243, "learning_rate": 4.685993219132123e-05, "loss": 0.5249, "num_input_tokens_seen": 2133344, "step": 6345 }, { "epoch": 4.90726429675425, "grad_norm": 1.2174254655838013, "learning_rate": 4.685174665334556e-05, "loss": 0.5066, "num_input_tokens_seen": 2135104, "step": 6350 }, { "epoch": 4.91112828438949, "grad_norm": 0.6822585463523865, "learning_rate": 4.684355117706464e-05, "loss": 0.3663, "num_input_tokens_seen": 2136512, "step": 6355 }, { "epoch": 4.91499227202473, "grad_norm": 2.3739538192749023, "learning_rate": 4.683534576620583e-05, "loss": 0.4599, "num_input_tokens_seen": 2138304, "step": 6360 }, { "epoch": 4.918856259659969, "grad_norm": 0.8361785411834717, "learning_rate": 4.6827130424501e-05, "loss": 0.6018, "num_input_tokens_seen": 2140160, "step": 6365 }, { "epoch": 4.922720247295208, "grad_norm": 1.8375751972198486, "learning_rate": 4.6818905155686526e-05, "loss": 0.8538, "num_input_tokens_seen": 2142144, "step": 6370 }, { "epoch": 4.9265842349304485, "grad_norm": 0.6745392680168152, "learning_rate": 4.681066996350333e-05, "loss": 0.4468, "num_input_tokens_seen": 2143776, "step": 6375 }, { "epoch": 4.930448222565688, "grad_norm": 0.6684293746948242, "learning_rate": 4.6802424851696816e-05, "loss": 0.5247, "num_input_tokens_seen": 2145344, "step": 6380 }, { "epoch": 4.934312210200927, "grad_norm": 0.6235945820808411, "learning_rate": 4.6794169824016896e-05, "loss": 0.3865, "num_input_tokens_seen": 2147040, "step": 6385 }, { "epoch": 4.938176197836167, "grad_norm": 0.7796215415000916, "learning_rate": 4.678590488421803e-05, "loss": 0.475, "num_input_tokens_seen": 2148768, "step": 6390 }, { "epoch": 4.942040185471407, "grad_norm": 0.9817187190055847, "learning_rate": 4.6777630036059154e-05, "loss": 0.5479, "num_input_tokens_seen": 2150496, "step": 6395 }, { "epoch": 4.945904173106646, "grad_norm": 0.6559157967567444, "learning_rate": 4.676934528330371e-05, "loss": 0.6036, "num_input_tokens_seen": 2152256, "step": 6400 }, { "epoch": 4.949768160741885, "grad_norm": 0.7496868968009949, "learning_rate": 4.676105062971967e-05, "loss": 0.4346, "num_input_tokens_seen": 2154176, "step": 6405 }, { "epoch": 4.9536321483771255, "grad_norm": 0.9143094420433044, "learning_rate": 4.675274607907947e-05, "loss": 0.5256, "num_input_tokens_seen": 2155968, "step": 6410 }, { "epoch": 4.957496136012365, "grad_norm": 0.8422152400016785, "learning_rate": 4.6744431635160094e-05, "loss": 0.4109, "num_input_tokens_seen": 2157600, "step": 6415 }, { "epoch": 4.961360123647604, "grad_norm": 0.9787008762359619, "learning_rate": 4.673610730174298e-05, "loss": 0.5971, "num_input_tokens_seen": 2159328, "step": 6420 }, { "epoch": 4.9652241112828435, "grad_norm": 0.762127161026001, "learning_rate": 4.672777308261409e-05, "loss": 0.5604, "num_input_tokens_seen": 2161152, "step": 6425 }, { "epoch": 4.969088098918084, "grad_norm": 1.3478137254714966, "learning_rate": 4.6719428981563885e-05, "loss": 0.5666, "num_input_tokens_seen": 2162752, "step": 6430 }, { "epoch": 4.972952086553323, "grad_norm": 1.41093909740448, "learning_rate": 4.6711075002387304e-05, "loss": 0.5874, "num_input_tokens_seen": 2164192, "step": 6435 }, { "epoch": 4.976816074188562, "grad_norm": 0.781207799911499, "learning_rate": 4.6702711148883794e-05, "loss": 0.4373, "num_input_tokens_seen": 2166048, "step": 6440 }, { "epoch": 4.9806800618238025, "grad_norm": 0.504897952079773, "learning_rate": 4.669433742485727e-05, "loss": 0.4616, "num_input_tokens_seen": 2167680, "step": 6445 }, { "epoch": 4.984544049459042, "grad_norm": 1.0267986059188843, "learning_rate": 4.668595383411617e-05, "loss": 0.548, "num_input_tokens_seen": 2169120, "step": 6450 }, { "epoch": 4.988408037094281, "grad_norm": 0.6210469603538513, "learning_rate": 4.6677560380473396e-05, "loss": 0.3417, "num_input_tokens_seen": 2170912, "step": 6455 }, { "epoch": 4.992272024729521, "grad_norm": 1.283807635307312, "learning_rate": 4.666915706774634e-05, "loss": 0.6984, "num_input_tokens_seen": 2172672, "step": 6460 }, { "epoch": 4.996136012364761, "grad_norm": 0.6813002228736877, "learning_rate": 4.6660743899756875e-05, "loss": 0.4015, "num_input_tokens_seen": 2174176, "step": 6465 }, { "epoch": 5.0, "grad_norm": 0.43830108642578125, "learning_rate": 4.665232088033136e-05, "loss": 0.3868, "num_input_tokens_seen": 2175568, "step": 6470 }, { "epoch": 5.0, "eval_loss": 0.514586329460144, "eval_runtime": 6.364, "eval_samples_per_second": 90.352, "eval_steps_per_second": 22.627, "num_input_tokens_seen": 2175568, "step": 6470 }, { "epoch": 5.003863987635239, "grad_norm": 0.8676257133483887, "learning_rate": 4.664388801330064e-05, "loss": 0.4733, "num_input_tokens_seen": 2177584, "step": 6475 }, { "epoch": 5.0077279752704795, "grad_norm": 0.9785415530204773, "learning_rate": 4.663544530250004e-05, "loss": 0.5722, "num_input_tokens_seen": 2179184, "step": 6480 }, { "epoch": 5.011591962905719, "grad_norm": 0.7063177227973938, "learning_rate": 4.662699275176934e-05, "loss": 0.4139, "num_input_tokens_seen": 2180816, "step": 6485 }, { "epoch": 5.015455950540958, "grad_norm": 0.860931932926178, "learning_rate": 4.661853036495281e-05, "loss": 0.48, "num_input_tokens_seen": 2182320, "step": 6490 }, { "epoch": 5.0193199381761975, "grad_norm": 0.7691118121147156, "learning_rate": 4.661005814589921e-05, "loss": 0.3869, "num_input_tokens_seen": 2183888, "step": 6495 }, { "epoch": 5.023183925811438, "grad_norm": 0.798329770565033, "learning_rate": 4.660157609846175e-05, "loss": 0.3593, "num_input_tokens_seen": 2185648, "step": 6500 }, { "epoch": 5.027047913446677, "grad_norm": 0.9490416049957275, "learning_rate": 4.659308422649811e-05, "loss": 0.465, "num_input_tokens_seen": 2187344, "step": 6505 }, { "epoch": 5.030911901081916, "grad_norm": 0.5633112788200378, "learning_rate": 4.6584582533870445e-05, "loss": 0.6135, "num_input_tokens_seen": 2189104, "step": 6510 }, { "epoch": 5.0347758887171565, "grad_norm": 0.9297870397567749, "learning_rate": 4.657607102444538e-05, "loss": 0.6932, "num_input_tokens_seen": 2191056, "step": 6515 }, { "epoch": 5.038639876352396, "grad_norm": 0.8881656527519226, "learning_rate": 4.656754970209401e-05, "loss": 0.3928, "num_input_tokens_seen": 2192752, "step": 6520 }, { "epoch": 5.042503863987635, "grad_norm": 0.5980252623558044, "learning_rate": 4.655901857069186e-05, "loss": 0.4527, "num_input_tokens_seen": 2194384, "step": 6525 }, { "epoch": 5.0463678516228745, "grad_norm": 0.7851317524909973, "learning_rate": 4.655047763411895e-05, "loss": 0.4559, "num_input_tokens_seen": 2196208, "step": 6530 }, { "epoch": 5.050231839258115, "grad_norm": 0.7313240766525269, "learning_rate": 4.654192689625976e-05, "loss": 0.4894, "num_input_tokens_seen": 2198000, "step": 6535 }, { "epoch": 5.054095826893354, "grad_norm": 0.8897801637649536, "learning_rate": 4.6533366361003204e-05, "loss": 0.5092, "num_input_tokens_seen": 2199984, "step": 6540 }, { "epoch": 5.057959814528593, "grad_norm": 0.8944630026817322, "learning_rate": 4.652479603224267e-05, "loss": 0.5189, "num_input_tokens_seen": 2201616, "step": 6545 }, { "epoch": 5.061823802163833, "grad_norm": 1.1340596675872803, "learning_rate": 4.651621591387599e-05, "loss": 0.6187, "num_input_tokens_seen": 2203280, "step": 6550 }, { "epoch": 5.065687789799073, "grad_norm": 0.9572818279266357, "learning_rate": 4.650762600980546e-05, "loss": 0.4138, "num_input_tokens_seen": 2205072, "step": 6555 }, { "epoch": 5.069551777434312, "grad_norm": 0.5847717523574829, "learning_rate": 4.6499026323937824e-05, "loss": 0.3989, "num_input_tokens_seen": 2206768, "step": 6560 }, { "epoch": 5.0734157650695515, "grad_norm": 0.5827094912528992, "learning_rate": 4.649041686018425e-05, "loss": 0.5765, "num_input_tokens_seen": 2208688, "step": 6565 }, { "epoch": 5.077279752704792, "grad_norm": 0.6819182634353638, "learning_rate": 4.6481797622460394e-05, "loss": 0.3945, "num_input_tokens_seen": 2210256, "step": 6570 }, { "epoch": 5.081143740340031, "grad_norm": 0.5552595257759094, "learning_rate": 4.647316861468633e-05, "loss": 0.5864, "num_input_tokens_seen": 2212016, "step": 6575 }, { "epoch": 5.08500772797527, "grad_norm": 1.0813068151474, "learning_rate": 4.646452984078658e-05, "loss": 0.7435, "num_input_tokens_seen": 2213648, "step": 6580 }, { "epoch": 5.08887171561051, "grad_norm": 0.5288902521133423, "learning_rate": 4.6455881304690116e-05, "loss": 0.3882, "num_input_tokens_seen": 2215408, "step": 6585 }, { "epoch": 5.09273570324575, "grad_norm": 0.7328352332115173, "learning_rate": 4.6447223010330334e-05, "loss": 0.4518, "num_input_tokens_seen": 2216976, "step": 6590 }, { "epoch": 5.096599690880989, "grad_norm": 1.253123164176941, "learning_rate": 4.6438554961645084e-05, "loss": 0.4458, "num_input_tokens_seen": 2218512, "step": 6595 }, { "epoch": 5.1004636785162285, "grad_norm": 0.8120790123939514, "learning_rate": 4.642987716257665e-05, "loss": 0.427, "num_input_tokens_seen": 2220336, "step": 6600 }, { "epoch": 5.104327666151469, "grad_norm": 0.8699557185173035, "learning_rate": 4.6421189617071754e-05, "loss": 0.4482, "num_input_tokens_seen": 2221808, "step": 6605 }, { "epoch": 5.108191653786708, "grad_norm": 1.256744623184204, "learning_rate": 4.6412492329081524e-05, "loss": 0.8174, "num_input_tokens_seen": 2223568, "step": 6610 }, { "epoch": 5.112055641421947, "grad_norm": 0.5059274435043335, "learning_rate": 4.640378530256155e-05, "loss": 0.5577, "num_input_tokens_seen": 2225008, "step": 6615 }, { "epoch": 5.115919629057187, "grad_norm": 0.7494547963142395, "learning_rate": 4.6395068541471834e-05, "loss": 0.4502, "num_input_tokens_seen": 2226640, "step": 6620 }, { "epoch": 5.119783616692427, "grad_norm": 1.1376112699508667, "learning_rate": 4.638634204977682e-05, "loss": 0.5159, "num_input_tokens_seen": 2228336, "step": 6625 }, { "epoch": 5.123647604327666, "grad_norm": 0.5658906102180481, "learning_rate": 4.637760583144536e-05, "loss": 0.5082, "num_input_tokens_seen": 2230288, "step": 6630 }, { "epoch": 5.1275115919629055, "grad_norm": 0.780434250831604, "learning_rate": 4.636885989045074e-05, "loss": 0.6057, "num_input_tokens_seen": 2232048, "step": 6635 }, { "epoch": 5.131375579598146, "grad_norm": 0.9706775546073914, "learning_rate": 4.6360104230770685e-05, "loss": 0.4715, "num_input_tokens_seen": 2233616, "step": 6640 }, { "epoch": 5.135239567233385, "grad_norm": 0.6418018341064453, "learning_rate": 4.635133885638729e-05, "loss": 0.5592, "num_input_tokens_seen": 2235504, "step": 6645 }, { "epoch": 5.139103554868624, "grad_norm": 0.6434085369110107, "learning_rate": 4.634256377128712e-05, "loss": 0.441, "num_input_tokens_seen": 2237360, "step": 6650 }, { "epoch": 5.142967542503864, "grad_norm": 0.6281254887580872, "learning_rate": 4.633377897946113e-05, "loss": 0.4074, "num_input_tokens_seen": 2239248, "step": 6655 }, { "epoch": 5.146831530139104, "grad_norm": 1.9723566770553589, "learning_rate": 4.6324984484904696e-05, "loss": 0.5635, "num_input_tokens_seen": 2240848, "step": 6660 }, { "epoch": 5.150695517774343, "grad_norm": 0.6863687634468079, "learning_rate": 4.631618029161761e-05, "loss": 0.657, "num_input_tokens_seen": 2242224, "step": 6665 }, { "epoch": 5.1545595054095825, "grad_norm": 0.596149742603302, "learning_rate": 4.630736640360407e-05, "loss": 0.4851, "num_input_tokens_seen": 2244272, "step": 6670 }, { "epoch": 5.158423493044822, "grad_norm": 0.5627493858337402, "learning_rate": 4.629854282487268e-05, "loss": 0.5409, "num_input_tokens_seen": 2246128, "step": 6675 }, { "epoch": 5.162287480680062, "grad_norm": 0.4752129316329956, "learning_rate": 4.6289709559436466e-05, "loss": 0.4365, "num_input_tokens_seen": 2248016, "step": 6680 }, { "epoch": 5.166151468315301, "grad_norm": 0.9285341501235962, "learning_rate": 4.6280866611312846e-05, "loss": 0.4353, "num_input_tokens_seen": 2249584, "step": 6685 }, { "epoch": 5.170015455950541, "grad_norm": 0.9108492732048035, "learning_rate": 4.627201398452364e-05, "loss": 0.4339, "num_input_tokens_seen": 2251120, "step": 6690 }, { "epoch": 5.173879443585781, "grad_norm": 1.105395793914795, "learning_rate": 4.626315168309509e-05, "loss": 0.4077, "num_input_tokens_seen": 2252880, "step": 6695 }, { "epoch": 5.17774343122102, "grad_norm": 1.1393561363220215, "learning_rate": 4.6254279711057804e-05, "loss": 0.5084, "num_input_tokens_seen": 2254320, "step": 6700 }, { "epoch": 5.1816074188562595, "grad_norm": 0.6135744452476501, "learning_rate": 4.624539807244682e-05, "loss": 0.3884, "num_input_tokens_seen": 2255952, "step": 6705 }, { "epoch": 5.185471406491499, "grad_norm": 0.7249361872673035, "learning_rate": 4.623650677130157e-05, "loss": 0.3835, "num_input_tokens_seen": 2257584, "step": 6710 }, { "epoch": 5.189335394126739, "grad_norm": 0.7422733902931213, "learning_rate": 4.622760581166585e-05, "loss": 0.5098, "num_input_tokens_seen": 2259472, "step": 6715 }, { "epoch": 5.193199381761978, "grad_norm": 0.6724643111228943, "learning_rate": 4.621869519758788e-05, "loss": 0.6079, "num_input_tokens_seen": 2261264, "step": 6720 }, { "epoch": 5.197063369397218, "grad_norm": 1.183847427368164, "learning_rate": 4.620977493312026e-05, "loss": 1.0739, "num_input_tokens_seen": 2263152, "step": 6725 }, { "epoch": 5.200927357032458, "grad_norm": 0.8424686193466187, "learning_rate": 4.6200845022319985e-05, "loss": 0.5928, "num_input_tokens_seen": 2264752, "step": 6730 }, { "epoch": 5.204791344667697, "grad_norm": 0.8158390522003174, "learning_rate": 4.619190546924843e-05, "loss": 0.5089, "num_input_tokens_seen": 2266416, "step": 6735 }, { "epoch": 5.2086553323029365, "grad_norm": 0.9268420338630676, "learning_rate": 4.6182956277971346e-05, "loss": 0.6196, "num_input_tokens_seen": 2268208, "step": 6740 }, { "epoch": 5.212519319938176, "grad_norm": 0.7769297957420349, "learning_rate": 4.617399745255889e-05, "loss": 0.4458, "num_input_tokens_seen": 2270000, "step": 6745 }, { "epoch": 5.216383307573416, "grad_norm": 1.41786789894104, "learning_rate": 4.616502899708558e-05, "loss": 0.4235, "num_input_tokens_seen": 2271568, "step": 6750 }, { "epoch": 5.220247295208655, "grad_norm": 0.9191035628318787, "learning_rate": 4.615605091563033e-05, "loss": 0.3753, "num_input_tokens_seen": 2273104, "step": 6755 }, { "epoch": 5.224111282843895, "grad_norm": 0.7322385907173157, "learning_rate": 4.614706321227644e-05, "loss": 0.4162, "num_input_tokens_seen": 2274672, "step": 6760 }, { "epoch": 5.227975270479135, "grad_norm": 1.1396411657333374, "learning_rate": 4.613806589111155e-05, "loss": 0.3772, "num_input_tokens_seen": 2276208, "step": 6765 }, { "epoch": 5.231839258114374, "grad_norm": 0.7852594256401062, "learning_rate": 4.6129058956227695e-05, "loss": 0.5936, "num_input_tokens_seen": 2278288, "step": 6770 }, { "epoch": 5.2357032457496135, "grad_norm": 1.6026842594146729, "learning_rate": 4.612004241172129e-05, "loss": 0.4405, "num_input_tokens_seen": 2280112, "step": 6775 }, { "epoch": 5.239567233384853, "grad_norm": 1.643273949623108, "learning_rate": 4.6111016261693116e-05, "loss": 0.4503, "num_input_tokens_seen": 2281840, "step": 6780 }, { "epoch": 5.243431221020093, "grad_norm": 0.9525704979896545, "learning_rate": 4.610198051024832e-05, "loss": 0.4544, "num_input_tokens_seen": 2283440, "step": 6785 }, { "epoch": 5.247295208655332, "grad_norm": 0.8766576647758484, "learning_rate": 4.609293516149641e-05, "loss": 0.4725, "num_input_tokens_seen": 2285040, "step": 6790 }, { "epoch": 5.251159196290572, "grad_norm": 1.5806865692138672, "learning_rate": 4.6083880219551265e-05, "loss": 0.4673, "num_input_tokens_seen": 2286576, "step": 6795 }, { "epoch": 5.255023183925811, "grad_norm": 0.913652241230011, "learning_rate": 4.607481568853114e-05, "loss": 0.4127, "num_input_tokens_seen": 2287984, "step": 6800 }, { "epoch": 5.258887171561051, "grad_norm": 0.6793085336685181, "learning_rate": 4.6065741572558616e-05, "loss": 0.5997, "num_input_tokens_seen": 2289584, "step": 6805 }, { "epoch": 5.2627511591962906, "grad_norm": 1.3542842864990234, "learning_rate": 4.605665787576068e-05, "loss": 0.723, "num_input_tokens_seen": 2291280, "step": 6810 }, { "epoch": 5.26661514683153, "grad_norm": 2.023494005203247, "learning_rate": 4.6047564602268626e-05, "loss": 0.5361, "num_input_tokens_seen": 2292752, "step": 6815 }, { "epoch": 5.27047913446677, "grad_norm": 1.141727328300476, "learning_rate": 4.603846175621816e-05, "loss": 0.5356, "num_input_tokens_seen": 2294576, "step": 6820 }, { "epoch": 5.274343122102009, "grad_norm": 1.8562073707580566, "learning_rate": 4.602934934174927e-05, "loss": 0.7264, "num_input_tokens_seen": 2296016, "step": 6825 }, { "epoch": 5.278207109737249, "grad_norm": 1.4275842905044556, "learning_rate": 4.6020227363006375e-05, "loss": 0.4798, "num_input_tokens_seen": 2297616, "step": 6830 }, { "epoch": 5.282071097372488, "grad_norm": 0.6231854557991028, "learning_rate": 4.601109582413818e-05, "loss": 0.5492, "num_input_tokens_seen": 2299216, "step": 6835 }, { "epoch": 5.285935085007728, "grad_norm": 0.9370237588882446, "learning_rate": 4.600195472929778e-05, "loss": 0.5822, "num_input_tokens_seen": 2300752, "step": 6840 }, { "epoch": 5.289799072642968, "grad_norm": 0.8839071393013, "learning_rate": 4.5992804082642594e-05, "loss": 0.4525, "num_input_tokens_seen": 2302384, "step": 6845 }, { "epoch": 5.293663060278207, "grad_norm": 1.1991002559661865, "learning_rate": 4.5983643888334385e-05, "loss": 0.5505, "num_input_tokens_seen": 2304176, "step": 6850 }, { "epoch": 5.297527047913447, "grad_norm": 0.6983346343040466, "learning_rate": 4.597447415053927e-05, "loss": 0.4964, "num_input_tokens_seen": 2305712, "step": 6855 }, { "epoch": 5.301391035548686, "grad_norm": 0.5130671858787537, "learning_rate": 4.59652948734277e-05, "loss": 0.431, "num_input_tokens_seen": 2307280, "step": 6860 }, { "epoch": 5.305255023183926, "grad_norm": 0.8036712408065796, "learning_rate": 4.5956106061174476e-05, "loss": 0.3751, "num_input_tokens_seen": 2308912, "step": 6865 }, { "epoch": 5.309119010819165, "grad_norm": 0.4709181785583496, "learning_rate": 4.59469077179587e-05, "loss": 0.5715, "num_input_tokens_seen": 2310480, "step": 6870 }, { "epoch": 5.312982998454405, "grad_norm": 0.7674537301063538, "learning_rate": 4.593769984796385e-05, "loss": 0.4273, "num_input_tokens_seen": 2312432, "step": 6875 }, { "epoch": 5.316846986089645, "grad_norm": 0.6692835092544556, "learning_rate": 4.592848245537773e-05, "loss": 0.4433, "num_input_tokens_seen": 2314160, "step": 6880 }, { "epoch": 5.320710973724884, "grad_norm": 0.7479203939437866, "learning_rate": 4.591925554439244e-05, "loss": 0.3733, "num_input_tokens_seen": 2315696, "step": 6885 }, { "epoch": 5.324574961360124, "grad_norm": 1.4451332092285156, "learning_rate": 4.5910019119204456e-05, "loss": 0.7195, "num_input_tokens_seen": 2317424, "step": 6890 }, { "epoch": 5.328438948995363, "grad_norm": 0.8261155486106873, "learning_rate": 4.5900773184014546e-05, "loss": 0.5281, "num_input_tokens_seen": 2319056, "step": 6895 }, { "epoch": 5.332302936630603, "grad_norm": 0.578213095664978, "learning_rate": 4.5891517743027824e-05, "loss": 0.3719, "num_input_tokens_seen": 2320688, "step": 6900 }, { "epoch": 5.336166924265842, "grad_norm": 1.198679804801941, "learning_rate": 4.5882252800453726e-05, "loss": 0.402, "num_input_tokens_seen": 2322416, "step": 6905 }, { "epoch": 5.340030911901082, "grad_norm": 0.7642441391944885, "learning_rate": 4.587297836050598e-05, "loss": 0.3723, "num_input_tokens_seen": 2324080, "step": 6910 }, { "epoch": 5.343894899536322, "grad_norm": 0.6018533706665039, "learning_rate": 4.5863694427402684e-05, "loss": 0.4906, "num_input_tokens_seen": 2325680, "step": 6915 }, { "epoch": 5.347758887171561, "grad_norm": 1.1146488189697266, "learning_rate": 4.5854401005366206e-05, "loss": 0.4905, "num_input_tokens_seen": 2327216, "step": 6920 }, { "epoch": 5.3516228748068, "grad_norm": 0.6668574810028076, "learning_rate": 4.584509809862327e-05, "loss": 0.4428, "num_input_tokens_seen": 2328848, "step": 6925 }, { "epoch": 5.35548686244204, "grad_norm": 0.7878247499465942, "learning_rate": 4.583578571140488e-05, "loss": 0.6172, "num_input_tokens_seen": 2330480, "step": 6930 }, { "epoch": 5.35935085007728, "grad_norm": 0.734015703201294, "learning_rate": 4.582646384794636e-05, "loss": 0.4358, "num_input_tokens_seen": 2332144, "step": 6935 }, { "epoch": 5.363214837712519, "grad_norm": 0.5567977428436279, "learning_rate": 4.581713251248736e-05, "loss": 0.4394, "num_input_tokens_seen": 2333712, "step": 6940 }, { "epoch": 5.367078825347759, "grad_norm": 0.6971529126167297, "learning_rate": 4.580779170927183e-05, "loss": 0.5304, "num_input_tokens_seen": 2335600, "step": 6945 }, { "epoch": 5.370942812982999, "grad_norm": 0.6576459407806396, "learning_rate": 4.5798441442548014e-05, "loss": 0.4378, "num_input_tokens_seen": 2337104, "step": 6950 }, { "epoch": 5.374806800618238, "grad_norm": 1.0400656461715698, "learning_rate": 4.5789081716568474e-05, "loss": 0.5929, "num_input_tokens_seen": 2338800, "step": 6955 }, { "epoch": 5.378670788253477, "grad_norm": 0.5178045034408569, "learning_rate": 4.577971253559006e-05, "loss": 0.4193, "num_input_tokens_seen": 2340496, "step": 6960 }, { "epoch": 5.382534775888717, "grad_norm": 1.0610209703445435, "learning_rate": 4.5770333903873955e-05, "loss": 0.392, "num_input_tokens_seen": 2342256, "step": 6965 }, { "epoch": 5.386398763523957, "grad_norm": 0.6289437413215637, "learning_rate": 4.576094582568558e-05, "loss": 0.468, "num_input_tokens_seen": 2343792, "step": 6970 }, { "epoch": 5.390262751159196, "grad_norm": 1.5364797115325928, "learning_rate": 4.575154830529473e-05, "loss": 0.4831, "num_input_tokens_seen": 2345392, "step": 6975 }, { "epoch": 5.394126738794436, "grad_norm": 0.770975649356842, "learning_rate": 4.574214134697543e-05, "loss": 0.4555, "num_input_tokens_seen": 2346800, "step": 6980 }, { "epoch": 5.397990726429676, "grad_norm": 0.84355628490448, "learning_rate": 4.573272495500602e-05, "loss": 0.4239, "num_input_tokens_seen": 2348528, "step": 6985 }, { "epoch": 5.401854714064915, "grad_norm": 0.9244521856307983, "learning_rate": 4.572329913366915e-05, "loss": 0.6452, "num_input_tokens_seen": 2350512, "step": 6990 }, { "epoch": 5.405718701700154, "grad_norm": 0.6060444116592407, "learning_rate": 4.571386388725172e-05, "loss": 0.4432, "num_input_tokens_seen": 2352240, "step": 6995 }, { "epoch": 5.409582689335394, "grad_norm": 0.6820939183235168, "learning_rate": 4.570441922004494e-05, "loss": 0.437, "num_input_tokens_seen": 2354096, "step": 7000 }, { "epoch": 5.413446676970634, "grad_norm": 1.882649302482605, "learning_rate": 4.5694965136344305e-05, "loss": 0.4365, "num_input_tokens_seen": 2355728, "step": 7005 }, { "epoch": 5.417310664605873, "grad_norm": 1.2973829507827759, "learning_rate": 4.568550164044959e-05, "loss": 0.7041, "num_input_tokens_seen": 2357456, "step": 7010 }, { "epoch": 5.421174652241113, "grad_norm": 1.4079736471176147, "learning_rate": 4.567602873666486e-05, "loss": 0.6628, "num_input_tokens_seen": 2359088, "step": 7015 }, { "epoch": 5.425038639876353, "grad_norm": 0.7418900728225708, "learning_rate": 4.5666546429298415e-05, "loss": 0.4023, "num_input_tokens_seen": 2360880, "step": 7020 }, { "epoch": 5.428902627511592, "grad_norm": 1.1008572578430176, "learning_rate": 4.56570547226629e-05, "loss": 0.5108, "num_input_tokens_seen": 2362608, "step": 7025 }, { "epoch": 5.432766615146831, "grad_norm": 1.1519477367401123, "learning_rate": 4.5647553621075184e-05, "loss": 0.4351, "num_input_tokens_seen": 2364112, "step": 7030 }, { "epoch": 5.436630602782071, "grad_norm": 0.6312509775161743, "learning_rate": 4.5638043128856436e-05, "loss": 0.4072, "num_input_tokens_seen": 2365904, "step": 7035 }, { "epoch": 5.440494590417311, "grad_norm": 0.885034441947937, "learning_rate": 4.5628523250332065e-05, "loss": 0.5539, "num_input_tokens_seen": 2367632, "step": 7040 }, { "epoch": 5.44435857805255, "grad_norm": 1.0212544202804565, "learning_rate": 4.5618993989831785e-05, "loss": 0.4369, "num_input_tokens_seen": 2369456, "step": 7045 }, { "epoch": 5.448222565687789, "grad_norm": 1.0311279296875, "learning_rate": 4.560945535168956e-05, "loss": 0.5667, "num_input_tokens_seen": 2371184, "step": 7050 }, { "epoch": 5.45208655332303, "grad_norm": 1.059944987297058, "learning_rate": 4.559990734024361e-05, "loss": 0.3818, "num_input_tokens_seen": 2372784, "step": 7055 }, { "epoch": 5.455950540958269, "grad_norm": 0.8776320219039917, "learning_rate": 4.559034995983643e-05, "loss": 0.813, "num_input_tokens_seen": 2374448, "step": 7060 }, { "epoch": 5.459814528593508, "grad_norm": 1.2975248098373413, "learning_rate": 4.558078321481478e-05, "loss": 0.6158, "num_input_tokens_seen": 2376208, "step": 7065 }, { "epoch": 5.4636785162287484, "grad_norm": 0.5187006592750549, "learning_rate": 4.557120710952968e-05, "loss": 0.3968, "num_input_tokens_seen": 2377872, "step": 7070 }, { "epoch": 5.467542503863988, "grad_norm": 1.1874914169311523, "learning_rate": 4.556162164833638e-05, "loss": 0.4658, "num_input_tokens_seen": 2379536, "step": 7075 }, { "epoch": 5.471406491499227, "grad_norm": 0.8533539175987244, "learning_rate": 4.5552026835594416e-05, "loss": 0.548, "num_input_tokens_seen": 2381200, "step": 7080 }, { "epoch": 5.475270479134466, "grad_norm": 0.9536433815956116, "learning_rate": 4.554242267566757e-05, "loss": 0.5924, "num_input_tokens_seen": 2382864, "step": 7085 }, { "epoch": 5.479134466769707, "grad_norm": 0.9030792117118835, "learning_rate": 4.553280917292387e-05, "loss": 0.6605, "num_input_tokens_seen": 2384656, "step": 7090 }, { "epoch": 5.482998454404946, "grad_norm": 0.7018606662750244, "learning_rate": 4.552318633173559e-05, "loss": 0.525, "num_input_tokens_seen": 2386256, "step": 7095 }, { "epoch": 5.486862442040185, "grad_norm": 0.6785745620727539, "learning_rate": 4.551355415647925e-05, "loss": 0.4547, "num_input_tokens_seen": 2387888, "step": 7100 }, { "epoch": 5.490726429675425, "grad_norm": 0.9608919024467468, "learning_rate": 4.550391265153564e-05, "loss": 0.4731, "num_input_tokens_seen": 2389744, "step": 7105 }, { "epoch": 5.494590417310665, "grad_norm": 1.1640933752059937, "learning_rate": 4.5494261821289755e-05, "loss": 0.8221, "num_input_tokens_seen": 2391536, "step": 7110 }, { "epoch": 5.498454404945904, "grad_norm": 0.6235552430152893, "learning_rate": 4.548460167013086e-05, "loss": 0.502, "num_input_tokens_seen": 2393424, "step": 7115 }, { "epoch": 5.502318392581143, "grad_norm": 0.8962815403938293, "learning_rate": 4.547493220245245e-05, "loss": 0.4862, "num_input_tokens_seen": 2395024, "step": 7120 }, { "epoch": 5.506182380216384, "grad_norm": 1.2649881839752197, "learning_rate": 4.5465253422652254e-05, "loss": 0.9743, "num_input_tokens_seen": 2396688, "step": 7125 }, { "epoch": 5.510046367851623, "grad_norm": 1.1332710981369019, "learning_rate": 4.545556533513224e-05, "loss": 0.3993, "num_input_tokens_seen": 2398320, "step": 7130 }, { "epoch": 5.513910355486862, "grad_norm": 0.7015656232833862, "learning_rate": 4.54458679442986e-05, "loss": 0.6342, "num_input_tokens_seen": 2400016, "step": 7135 }, { "epoch": 5.5177743431221025, "grad_norm": 1.26920485496521, "learning_rate": 4.543616125456179e-05, "loss": 0.5179, "num_input_tokens_seen": 2401776, "step": 7140 }, { "epoch": 5.521638330757342, "grad_norm": 0.7309451699256897, "learning_rate": 4.5426445270336446e-05, "loss": 0.4605, "num_input_tokens_seen": 2403568, "step": 7145 }, { "epoch": 5.525502318392581, "grad_norm": 0.7285791635513306, "learning_rate": 4.5416719996041466e-05, "loss": 0.3984, "num_input_tokens_seen": 2405168, "step": 7150 }, { "epoch": 5.52936630602782, "grad_norm": 0.8479931950569153, "learning_rate": 4.5406985436099954e-05, "loss": 0.5119, "num_input_tokens_seen": 2406960, "step": 7155 }, { "epoch": 5.533230293663061, "grad_norm": 0.6150367856025696, "learning_rate": 4.539724159493926e-05, "loss": 0.434, "num_input_tokens_seen": 2408592, "step": 7160 }, { "epoch": 5.5370942812983, "grad_norm": 0.6897862553596497, "learning_rate": 4.538748847699092e-05, "loss": 0.4689, "num_input_tokens_seen": 2410128, "step": 7165 }, { "epoch": 5.540958268933539, "grad_norm": 1.3029837608337402, "learning_rate": 4.537772608669074e-05, "loss": 0.4733, "num_input_tokens_seen": 2411952, "step": 7170 }, { "epoch": 5.544822256568779, "grad_norm": 0.9185271859169006, "learning_rate": 4.5367954428478695e-05, "loss": 0.4529, "num_input_tokens_seen": 2413872, "step": 7175 }, { "epoch": 5.548686244204019, "grad_norm": 1.3204549551010132, "learning_rate": 4.5358173506799e-05, "loss": 0.7485, "num_input_tokens_seen": 2415440, "step": 7180 }, { "epoch": 5.552550231839258, "grad_norm": 1.0789564847946167, "learning_rate": 4.5348383326100076e-05, "loss": 0.5127, "num_input_tokens_seen": 2416976, "step": 7185 }, { "epoch": 5.556414219474497, "grad_norm": 0.7880880236625671, "learning_rate": 4.533858389083454e-05, "loss": 0.4186, "num_input_tokens_seen": 2418576, "step": 7190 }, { "epoch": 5.560278207109738, "grad_norm": 2.389390230178833, "learning_rate": 4.5328775205459256e-05, "loss": 0.5292, "num_input_tokens_seen": 2420080, "step": 7195 }, { "epoch": 5.564142194744977, "grad_norm": 1.005128026008606, "learning_rate": 4.5318957274435266e-05, "loss": 0.595, "num_input_tokens_seen": 2421840, "step": 7200 }, { "epoch": 5.568006182380216, "grad_norm": 1.2019304037094116, "learning_rate": 4.530913010222782e-05, "loss": 0.4612, "num_input_tokens_seen": 2423664, "step": 7205 }, { "epoch": 5.571870170015456, "grad_norm": 0.5076433420181274, "learning_rate": 4.529929369330638e-05, "loss": 0.3872, "num_input_tokens_seen": 2425328, "step": 7210 }, { "epoch": 5.575734157650696, "grad_norm": 0.7608737945556641, "learning_rate": 4.528944805214459e-05, "loss": 0.4805, "num_input_tokens_seen": 2426800, "step": 7215 }, { "epoch": 5.579598145285935, "grad_norm": 0.9959830641746521, "learning_rate": 4.527959318322033e-05, "loss": 0.4236, "num_input_tokens_seen": 2428368, "step": 7220 }, { "epoch": 5.583462132921174, "grad_norm": 0.5430702567100525, "learning_rate": 4.526972909101563e-05, "loss": 0.5533, "num_input_tokens_seen": 2429904, "step": 7225 }, { "epoch": 5.587326120556414, "grad_norm": 0.9637815952301025, "learning_rate": 4.525985578001676e-05, "loss": 0.4361, "num_input_tokens_seen": 2431344, "step": 7230 }, { "epoch": 5.591190108191654, "grad_norm": 0.7297855615615845, "learning_rate": 4.524997325471414e-05, "loss": 0.4716, "num_input_tokens_seen": 2433200, "step": 7235 }, { "epoch": 5.595054095826893, "grad_norm": 0.7529439926147461, "learning_rate": 4.5240081519602416e-05, "loss": 0.6256, "num_input_tokens_seen": 2435088, "step": 7240 }, { "epoch": 5.598918083462133, "grad_norm": 1.4958518743515015, "learning_rate": 4.5230180579180405e-05, "loss": 0.6738, "num_input_tokens_seen": 2436880, "step": 7245 }, { "epoch": 5.602782071097373, "grad_norm": 0.5577096343040466, "learning_rate": 4.5220270437951104e-05, "loss": 0.4511, "num_input_tokens_seen": 2438608, "step": 7250 }, { "epoch": 5.606646058732612, "grad_norm": 0.7573778033256531, "learning_rate": 4.521035110042172e-05, "loss": 0.6251, "num_input_tokens_seen": 2440272, "step": 7255 }, { "epoch": 5.6105100463678514, "grad_norm": 1.1705173254013062, "learning_rate": 4.5200422571103625e-05, "loss": 1.0041, "num_input_tokens_seen": 2441840, "step": 7260 }, { "epoch": 5.614374034003092, "grad_norm": 0.48653170466423035, "learning_rate": 4.519048485451236e-05, "loss": 0.4219, "num_input_tokens_seen": 2443472, "step": 7265 }, { "epoch": 5.618238021638331, "grad_norm": 0.7402019500732422, "learning_rate": 4.518053795516768e-05, "loss": 0.4827, "num_input_tokens_seen": 2444848, "step": 7270 }, { "epoch": 5.62210200927357, "grad_norm": 0.5138487815856934, "learning_rate": 4.517058187759347e-05, "loss": 0.5237, "num_input_tokens_seen": 2446352, "step": 7275 }, { "epoch": 5.62596599690881, "grad_norm": 0.8667433857917786, "learning_rate": 4.5160616626317825e-05, "loss": 0.5126, "num_input_tokens_seen": 2447888, "step": 7280 }, { "epoch": 5.62982998454405, "grad_norm": 0.6289629936218262, "learning_rate": 4.515064220587301e-05, "loss": 0.5525, "num_input_tokens_seen": 2449424, "step": 7285 }, { "epoch": 5.633693972179289, "grad_norm": 0.7476175427436829, "learning_rate": 4.5140658620795426e-05, "loss": 0.4234, "num_input_tokens_seen": 2451056, "step": 7290 }, { "epoch": 5.6375579598145285, "grad_norm": 1.1606649160385132, "learning_rate": 4.51306658756257e-05, "loss": 0.5065, "num_input_tokens_seen": 2452464, "step": 7295 }, { "epoch": 5.641421947449768, "grad_norm": 1.1065298318862915, "learning_rate": 4.512066397490857e-05, "loss": 0.5519, "num_input_tokens_seen": 2454096, "step": 7300 }, { "epoch": 5.645285935085008, "grad_norm": 0.8001560568809509, "learning_rate": 4.511065292319296e-05, "loss": 0.5722, "num_input_tokens_seen": 2455632, "step": 7305 }, { "epoch": 5.649149922720247, "grad_norm": 0.7742248773574829, "learning_rate": 4.510063272503198e-05, "loss": 0.4035, "num_input_tokens_seen": 2457136, "step": 7310 }, { "epoch": 5.653013910355487, "grad_norm": 1.1726938486099243, "learning_rate": 4.5090603384982844e-05, "loss": 0.7875, "num_input_tokens_seen": 2458672, "step": 7315 }, { "epoch": 5.656877897990727, "grad_norm": 0.8123732805252075, "learning_rate": 4.508056490760697e-05, "loss": 0.4194, "num_input_tokens_seen": 2460176, "step": 7320 }, { "epoch": 5.660741885625966, "grad_norm": 0.9207563996315002, "learning_rate": 4.507051729746993e-05, "loss": 0.4214, "num_input_tokens_seen": 2462032, "step": 7325 }, { "epoch": 5.6646058732612055, "grad_norm": 0.7138177156448364, "learning_rate": 4.5060460559141414e-05, "loss": 0.417, "num_input_tokens_seen": 2463664, "step": 7330 }, { "epoch": 5.668469860896445, "grad_norm": 0.6845024228096008, "learning_rate": 4.5050394697195294e-05, "loss": 0.4169, "num_input_tokens_seen": 2465264, "step": 7335 }, { "epoch": 5.672333848531685, "grad_norm": 1.7606399059295654, "learning_rate": 4.5040319716209605e-05, "loss": 0.6893, "num_input_tokens_seen": 2467152, "step": 7340 }, { "epoch": 5.676197836166924, "grad_norm": 0.7729595899581909, "learning_rate": 4.503023562076648e-05, "loss": 0.4037, "num_input_tokens_seen": 2468784, "step": 7345 }, { "epoch": 5.680061823802164, "grad_norm": 0.8606013655662537, "learning_rate": 4.502014241545225e-05, "loss": 0.4488, "num_input_tokens_seen": 2470544, "step": 7350 }, { "epoch": 5.683925811437403, "grad_norm": 1.5773564577102661, "learning_rate": 4.501004010485734e-05, "loss": 0.796, "num_input_tokens_seen": 2472368, "step": 7355 }, { "epoch": 5.687789799072643, "grad_norm": 0.8366996049880981, "learning_rate": 4.499992869357637e-05, "loss": 0.5707, "num_input_tokens_seen": 2474128, "step": 7360 }, { "epoch": 5.6916537867078825, "grad_norm": 1.2759770154953003, "learning_rate": 4.498980818620804e-05, "loss": 0.6273, "num_input_tokens_seen": 2475920, "step": 7365 }, { "epoch": 5.695517774343122, "grad_norm": 1.047507643699646, "learning_rate": 4.4979678587355236e-05, "loss": 0.4375, "num_input_tokens_seen": 2477328, "step": 7370 }, { "epoch": 5.699381761978362, "grad_norm": 0.5097224712371826, "learning_rate": 4.496953990162496e-05, "loss": 0.3737, "num_input_tokens_seen": 2478864, "step": 7375 }, { "epoch": 5.703245749613601, "grad_norm": 0.8693650960922241, "learning_rate": 4.4959392133628345e-05, "loss": 0.5112, "num_input_tokens_seen": 2480656, "step": 7380 }, { "epoch": 5.707109737248841, "grad_norm": 0.6906445026397705, "learning_rate": 4.4949235287980654e-05, "loss": 0.5221, "num_input_tokens_seen": 2482416, "step": 7385 }, { "epoch": 5.710973724884081, "grad_norm": 0.9533892273902893, "learning_rate": 4.493906936930128e-05, "loss": 0.6884, "num_input_tokens_seen": 2484016, "step": 7390 }, { "epoch": 5.71483771251932, "grad_norm": 1.1177769899368286, "learning_rate": 4.492889438221375e-05, "loss": 0.4402, "num_input_tokens_seen": 2485488, "step": 7395 }, { "epoch": 5.7187017001545595, "grad_norm": 0.8155874013900757, "learning_rate": 4.491871033134571e-05, "loss": 0.4368, "num_input_tokens_seen": 2487376, "step": 7400 }, { "epoch": 5.722565687789799, "grad_norm": 0.6766989827156067, "learning_rate": 4.4908517221328915e-05, "loss": 0.3775, "num_input_tokens_seen": 2489136, "step": 7405 }, { "epoch": 5.726429675425039, "grad_norm": 0.8083646893501282, "learning_rate": 4.489831505679927e-05, "loss": 0.4173, "num_input_tokens_seen": 2490608, "step": 7410 }, { "epoch": 5.730293663060278, "grad_norm": 1.288374900817871, "learning_rate": 4.488810384239675e-05, "loss": 0.4642, "num_input_tokens_seen": 2492688, "step": 7415 }, { "epoch": 5.734157650695518, "grad_norm": 0.9273604154586792, "learning_rate": 4.487788358276552e-05, "loss": 0.5584, "num_input_tokens_seen": 2494480, "step": 7420 }, { "epoch": 5.738021638330757, "grad_norm": 0.9807952642440796, "learning_rate": 4.4867654282553784e-05, "loss": 0.478, "num_input_tokens_seen": 2496432, "step": 7425 }, { "epoch": 5.741885625965997, "grad_norm": 0.8262242674827576, "learning_rate": 4.4857415946413896e-05, "loss": 0.555, "num_input_tokens_seen": 2498256, "step": 7430 }, { "epoch": 5.7457496136012365, "grad_norm": 0.7137211561203003, "learning_rate": 4.484716857900232e-05, "loss": 0.4363, "num_input_tokens_seen": 2499920, "step": 7435 }, { "epoch": 5.749613601236476, "grad_norm": 0.5886214971542358, "learning_rate": 4.4836912184979606e-05, "loss": 0.5229, "num_input_tokens_seen": 2501744, "step": 7440 }, { "epoch": 5.753477588871716, "grad_norm": 0.9466813206672668, "learning_rate": 4.482664676901043e-05, "loss": 0.46, "num_input_tokens_seen": 2503472, "step": 7445 }, { "epoch": 5.757341576506955, "grad_norm": 1.1032521724700928, "learning_rate": 4.481637233576358e-05, "loss": 0.6501, "num_input_tokens_seen": 2504944, "step": 7450 }, { "epoch": 5.761205564142195, "grad_norm": 1.136693000793457, "learning_rate": 4.48060888899119e-05, "loss": 0.5847, "num_input_tokens_seen": 2506544, "step": 7455 }, { "epoch": 5.765069551777434, "grad_norm": 0.8674026727676392, "learning_rate": 4.4795796436132384e-05, "loss": 0.4452, "num_input_tokens_seen": 2507984, "step": 7460 }, { "epoch": 5.768933539412674, "grad_norm": 0.7829760313034058, "learning_rate": 4.47854949791061e-05, "loss": 0.5505, "num_input_tokens_seen": 2509776, "step": 7465 }, { "epoch": 5.7727975270479135, "grad_norm": 0.7993524074554443, "learning_rate": 4.477518452351821e-05, "loss": 0.5705, "num_input_tokens_seen": 2511312, "step": 7470 }, { "epoch": 5.776661514683153, "grad_norm": 1.1066961288452148, "learning_rate": 4.4764865074057974e-05, "loss": 0.4455, "num_input_tokens_seen": 2513072, "step": 7475 }, { "epoch": 5.780525502318392, "grad_norm": 0.7670343518257141, "learning_rate": 4.4754536635418725e-05, "loss": 0.5316, "num_input_tokens_seen": 2515024, "step": 7480 }, { "epoch": 5.784389489953632, "grad_norm": 0.7886584401130676, "learning_rate": 4.4744199212297914e-05, "loss": 0.5379, "num_input_tokens_seen": 2516752, "step": 7485 }, { "epoch": 5.788253477588872, "grad_norm": 0.7314439415931702, "learning_rate": 4.473385280939706e-05, "loss": 0.5832, "num_input_tokens_seen": 2518448, "step": 7490 }, { "epoch": 5.792117465224111, "grad_norm": 0.677507221698761, "learning_rate": 4.4723497431421756e-05, "loss": 0.7224, "num_input_tokens_seen": 2520144, "step": 7495 }, { "epoch": 5.795981452859351, "grad_norm": 0.6912359595298767, "learning_rate": 4.4713133083081715e-05, "loss": 0.5788, "num_input_tokens_seen": 2521744, "step": 7500 }, { "epoch": 5.7998454404945905, "grad_norm": 0.7595194578170776, "learning_rate": 4.470275976909068e-05, "loss": 0.5738, "num_input_tokens_seen": 2523440, "step": 7505 }, { "epoch": 5.80370942812983, "grad_norm": 0.45864811539649963, "learning_rate": 4.469237749416651e-05, "loss": 0.6562, "num_input_tokens_seen": 2524848, "step": 7510 }, { "epoch": 5.80757341576507, "grad_norm": 0.9869723916053772, "learning_rate": 4.4681986263031125e-05, "loss": 0.3968, "num_input_tokens_seen": 2526544, "step": 7515 }, { "epoch": 5.811437403400309, "grad_norm": 1.1269744634628296, "learning_rate": 4.467158608041051e-05, "loss": 0.4392, "num_input_tokens_seen": 2528304, "step": 7520 }, { "epoch": 5.815301391035549, "grad_norm": 0.6194765567779541, "learning_rate": 4.466117695103474e-05, "loss": 0.5046, "num_input_tokens_seen": 2530160, "step": 7525 }, { "epoch": 5.819165378670788, "grad_norm": 1.019202470779419, "learning_rate": 4.465075887963796e-05, "loss": 0.4648, "num_input_tokens_seen": 2531760, "step": 7530 }, { "epoch": 5.823029366306028, "grad_norm": 1.3859542608261108, "learning_rate": 4.464033187095834e-05, "loss": 0.5628, "num_input_tokens_seen": 2533488, "step": 7535 }, { "epoch": 5.8268933539412675, "grad_norm": 0.731984555721283, "learning_rate": 4.462989592973817e-05, "loss": 0.4073, "num_input_tokens_seen": 2535248, "step": 7540 }, { "epoch": 5.830757341576507, "grad_norm": 0.9260021448135376, "learning_rate": 4.461945106072377e-05, "loss": 0.4162, "num_input_tokens_seen": 2536720, "step": 7545 }, { "epoch": 5.834621329211746, "grad_norm": 0.5514627695083618, "learning_rate": 4.460899726866554e-05, "loss": 0.4919, "num_input_tokens_seen": 2538768, "step": 7550 }, { "epoch": 5.838485316846986, "grad_norm": 0.6039670705795288, "learning_rate": 4.459853455831791e-05, "loss": 0.4362, "num_input_tokens_seen": 2540496, "step": 7555 }, { "epoch": 5.842349304482226, "grad_norm": 0.8318453431129456, "learning_rate": 4.458806293443939e-05, "loss": 0.4154, "num_input_tokens_seen": 2542224, "step": 7560 }, { "epoch": 5.846213292117465, "grad_norm": 1.0445386171340942, "learning_rate": 4.457758240179255e-05, "loss": 0.4548, "num_input_tokens_seen": 2544080, "step": 7565 }, { "epoch": 5.850077279752705, "grad_norm": 0.8834079504013062, "learning_rate": 4.4567092965143974e-05, "loss": 0.437, "num_input_tokens_seen": 2545744, "step": 7570 }, { "epoch": 5.8539412673879445, "grad_norm": 0.8884050250053406, "learning_rate": 4.455659462926435e-05, "loss": 0.4392, "num_input_tokens_seen": 2547536, "step": 7575 }, { "epoch": 5.857805255023184, "grad_norm": 0.7521959543228149, "learning_rate": 4.454608739892836e-05, "loss": 0.4179, "num_input_tokens_seen": 2549200, "step": 7580 }, { "epoch": 5.861669242658423, "grad_norm": 0.6893994212150574, "learning_rate": 4.4535571278914765e-05, "loss": 0.4375, "num_input_tokens_seen": 2550832, "step": 7585 }, { "epoch": 5.865533230293663, "grad_norm": 0.5010923743247986, "learning_rate": 4.452504627400635e-05, "loss": 0.3941, "num_input_tokens_seen": 2552368, "step": 7590 }, { "epoch": 5.869397217928903, "grad_norm": 1.3217413425445557, "learning_rate": 4.451451238898997e-05, "loss": 0.5989, "num_input_tokens_seen": 2554032, "step": 7595 }, { "epoch": 5.873261205564142, "grad_norm": 0.6770143508911133, "learning_rate": 4.4503969628656484e-05, "loss": 0.4683, "num_input_tokens_seen": 2555728, "step": 7600 }, { "epoch": 5.877125193199381, "grad_norm": 0.8103022575378418, "learning_rate": 4.449341799780081e-05, "loss": 0.4482, "num_input_tokens_seen": 2557488, "step": 7605 }, { "epoch": 5.8809891808346215, "grad_norm": 1.2558444738388062, "learning_rate": 4.448285750122188e-05, "loss": 0.4345, "num_input_tokens_seen": 2559376, "step": 7610 }, { "epoch": 5.884853168469861, "grad_norm": 0.6677622199058533, "learning_rate": 4.44722881437227e-05, "loss": 0.4856, "num_input_tokens_seen": 2560976, "step": 7615 }, { "epoch": 5.8887171561051, "grad_norm": 1.0714313983917236, "learning_rate": 4.4461709930110236e-05, "loss": 0.4285, "num_input_tokens_seen": 2562544, "step": 7620 }, { "epoch": 5.89258114374034, "grad_norm": 0.7384278774261475, "learning_rate": 4.445112286519555e-05, "loss": 0.4647, "num_input_tokens_seen": 2564272, "step": 7625 }, { "epoch": 5.89644513137558, "grad_norm": 0.9443633556365967, "learning_rate": 4.44405269537937e-05, "loss": 0.668, "num_input_tokens_seen": 2566128, "step": 7630 }, { "epoch": 5.900309119010819, "grad_norm": 0.631790280342102, "learning_rate": 4.442992220072376e-05, "loss": 0.3421, "num_input_tokens_seen": 2567632, "step": 7635 }, { "epoch": 5.904173106646059, "grad_norm": 0.8153517842292786, "learning_rate": 4.4419308610808854e-05, "loss": 0.488, "num_input_tokens_seen": 2569488, "step": 7640 }, { "epoch": 5.9080370942812985, "grad_norm": 0.9674537777900696, "learning_rate": 4.440868618887608e-05, "loss": 0.4791, "num_input_tokens_seen": 2571312, "step": 7645 }, { "epoch": 5.911901081916538, "grad_norm": 0.8849340677261353, "learning_rate": 4.4398054939756606e-05, "loss": 0.5457, "num_input_tokens_seen": 2573040, "step": 7650 }, { "epoch": 5.915765069551777, "grad_norm": 0.8881857991218567, "learning_rate": 4.4387414868285566e-05, "loss": 0.5562, "num_input_tokens_seen": 2574960, "step": 7655 }, { "epoch": 5.919629057187017, "grad_norm": 0.4927532970905304, "learning_rate": 4.437676597930214e-05, "loss": 0.417, "num_input_tokens_seen": 2576784, "step": 7660 }, { "epoch": 5.923493044822257, "grad_norm": 0.9423977136611938, "learning_rate": 4.436610827764951e-05, "loss": 0.4043, "num_input_tokens_seen": 2578384, "step": 7665 }, { "epoch": 5.927357032457496, "grad_norm": 0.707086443901062, "learning_rate": 4.435544176817484e-05, "loss": 0.3448, "num_input_tokens_seen": 2579888, "step": 7670 }, { "epoch": 5.931221020092735, "grad_norm": 1.4712990522384644, "learning_rate": 4.4344766455729357e-05, "loss": 0.3999, "num_input_tokens_seen": 2581520, "step": 7675 }, { "epoch": 5.9350850077279755, "grad_norm": 0.8051708340644836, "learning_rate": 4.433408234516823e-05, "loss": 0.7419, "num_input_tokens_seen": 2582928, "step": 7680 }, { "epoch": 5.938948995363215, "grad_norm": 0.6360947489738464, "learning_rate": 4.4323389441350664e-05, "loss": 0.3951, "num_input_tokens_seen": 2584784, "step": 7685 }, { "epoch": 5.942812982998454, "grad_norm": 0.9470600485801697, "learning_rate": 4.4312687749139857e-05, "loss": 0.5004, "num_input_tokens_seen": 2586416, "step": 7690 }, { "epoch": 5.946676970633694, "grad_norm": 1.5433039665222168, "learning_rate": 4.4301977273403005e-05, "loss": 0.6971, "num_input_tokens_seen": 2588080, "step": 7695 }, { "epoch": 5.950540958268934, "grad_norm": 0.8978313207626343, "learning_rate": 4.4291258019011294e-05, "loss": 0.5107, "num_input_tokens_seen": 2589584, "step": 7700 }, { "epoch": 5.954404945904173, "grad_norm": 0.8619821667671204, "learning_rate": 4.42805299908399e-05, "loss": 0.4326, "num_input_tokens_seen": 2591568, "step": 7705 }, { "epoch": 5.958268933539412, "grad_norm": 0.5083600282669067, "learning_rate": 4.426979319376801e-05, "loss": 0.3749, "num_input_tokens_seen": 2593200, "step": 7710 }, { "epoch": 5.9621329211746525, "grad_norm": 1.0784833431243896, "learning_rate": 4.425904763267877e-05, "loss": 0.525, "num_input_tokens_seen": 2594896, "step": 7715 }, { "epoch": 5.965996908809892, "grad_norm": 0.4268975853919983, "learning_rate": 4.424829331245932e-05, "loss": 0.3293, "num_input_tokens_seen": 2596336, "step": 7720 }, { "epoch": 5.969860896445131, "grad_norm": 1.216881275177002, "learning_rate": 4.423753023800081e-05, "loss": 0.5166, "num_input_tokens_seen": 2598032, "step": 7725 }, { "epoch": 5.9737248840803705, "grad_norm": 0.9263426065444946, "learning_rate": 4.4226758414198325e-05, "loss": 0.4893, "num_input_tokens_seen": 2599760, "step": 7730 }, { "epoch": 5.977588871715611, "grad_norm": 0.8738150000572205, "learning_rate": 4.421597784595098e-05, "loss": 0.3612, "num_input_tokens_seen": 2601232, "step": 7735 }, { "epoch": 5.98145285935085, "grad_norm": 0.7414423227310181, "learning_rate": 4.420518853816182e-05, "loss": 0.4277, "num_input_tokens_seen": 2603024, "step": 7740 }, { "epoch": 5.985316846986089, "grad_norm": 0.8450551629066467, "learning_rate": 4.4194390495737915e-05, "loss": 0.3837, "num_input_tokens_seen": 2604528, "step": 7745 }, { "epoch": 5.9891808346213296, "grad_norm": 0.9892767667770386, "learning_rate": 4.418358372359025e-05, "loss": 0.4054, "num_input_tokens_seen": 2606224, "step": 7750 }, { "epoch": 5.993044822256569, "grad_norm": 1.0920836925506592, "learning_rate": 4.417276822663382e-05, "loss": 0.434, "num_input_tokens_seen": 2608112, "step": 7755 }, { "epoch": 5.996908809891808, "grad_norm": 0.7963327765464783, "learning_rate": 4.416194400978758e-05, "loss": 0.4999, "num_input_tokens_seen": 2609776, "step": 7760 }, { "epoch": 6.0, "eval_loss": 0.501906156539917, "eval_runtime": 6.3657, "eval_samples_per_second": 90.328, "eval_steps_per_second": 22.621, "num_input_tokens_seen": 2611168, "step": 7764 }, { "epoch": 6.0007727975270475, "grad_norm": 0.6547398567199707, "learning_rate": 4.415111107797445e-05, "loss": 0.4943, "num_input_tokens_seen": 2611648, "step": 7765 }, { "epoch": 6.004636785162288, "grad_norm": 0.7499632239341736, "learning_rate": 4.414026943612132e-05, "loss": 0.5829, "num_input_tokens_seen": 2613344, "step": 7770 }, { "epoch": 6.008500772797527, "grad_norm": 1.967673897743225, "learning_rate": 4.412941908915901e-05, "loss": 0.4634, "num_input_tokens_seen": 2614912, "step": 7775 }, { "epoch": 6.012364760432766, "grad_norm": 0.6256319880485535, "learning_rate": 4.411856004202234e-05, "loss": 0.4323, "num_input_tokens_seen": 2616480, "step": 7780 }, { "epoch": 6.016228748068007, "grad_norm": 0.6666427254676819, "learning_rate": 4.4107692299650064e-05, "loss": 0.4477, "num_input_tokens_seen": 2618016, "step": 7785 }, { "epoch": 6.020092735703246, "grad_norm": 0.971574068069458, "learning_rate": 4.4096815866984905e-05, "loss": 0.5904, "num_input_tokens_seen": 2619552, "step": 7790 }, { "epoch": 6.023956723338485, "grad_norm": 1.1984527111053467, "learning_rate": 4.408593074897352e-05, "loss": 0.5909, "num_input_tokens_seen": 2621536, "step": 7795 }, { "epoch": 6.0278207109737245, "grad_norm": 0.6896765232086182, "learning_rate": 4.407503695056653e-05, "loss": 0.4839, "num_input_tokens_seen": 2622944, "step": 7800 }, { "epoch": 6.031684698608965, "grad_norm": 0.8135150074958801, "learning_rate": 4.40641344767185e-05, "loss": 0.4154, "num_input_tokens_seen": 2624512, "step": 7805 }, { "epoch": 6.035548686244204, "grad_norm": 0.8512811660766602, "learning_rate": 4.4053223332387936e-05, "loss": 0.4672, "num_input_tokens_seen": 2626016, "step": 7810 }, { "epoch": 6.039412673879443, "grad_norm": 0.9702841639518738, "learning_rate": 4.40423035225373e-05, "loss": 0.4355, "num_input_tokens_seen": 2627904, "step": 7815 }, { "epoch": 6.043276661514684, "grad_norm": 0.714031457901001, "learning_rate": 4.403137505213297e-05, "loss": 0.4403, "num_input_tokens_seen": 2629600, "step": 7820 }, { "epoch": 6.047140649149923, "grad_norm": 0.6647945046424866, "learning_rate": 4.402043792614531e-05, "loss": 0.4655, "num_input_tokens_seen": 2631168, "step": 7825 }, { "epoch": 6.051004636785162, "grad_norm": 0.7833023071289062, "learning_rate": 4.400949214954856e-05, "loss": 0.4366, "num_input_tokens_seen": 2632672, "step": 7830 }, { "epoch": 6.0548686244204015, "grad_norm": 0.724306046962738, "learning_rate": 4.3998537727320944e-05, "loss": 0.4552, "num_input_tokens_seen": 2634400, "step": 7835 }, { "epoch": 6.058732612055642, "grad_norm": 0.7040703892707825, "learning_rate": 4.398757466444459e-05, "loss": 0.4377, "num_input_tokens_seen": 2636128, "step": 7840 }, { "epoch": 6.062596599690881, "grad_norm": 0.5759443640708923, "learning_rate": 4.397660296590556e-05, "loss": 0.6333, "num_input_tokens_seen": 2637952, "step": 7845 }, { "epoch": 6.06646058732612, "grad_norm": 0.6388139724731445, "learning_rate": 4.396562263669386e-05, "loss": 0.4745, "num_input_tokens_seen": 2639680, "step": 7850 }, { "epoch": 6.07032457496136, "grad_norm": 0.5161452889442444, "learning_rate": 4.39546336818034e-05, "loss": 0.3864, "num_input_tokens_seen": 2641376, "step": 7855 }, { "epoch": 6.0741885625966, "grad_norm": 1.3079047203063965, "learning_rate": 4.394363610623203e-05, "loss": 0.5, "num_input_tokens_seen": 2643232, "step": 7860 }, { "epoch": 6.078052550231839, "grad_norm": 0.6085163354873657, "learning_rate": 4.393262991498151e-05, "loss": 0.3891, "num_input_tokens_seen": 2645056, "step": 7865 }, { "epoch": 6.0819165378670785, "grad_norm": 1.22160005569458, "learning_rate": 4.3921615113057524e-05, "loss": 0.6278, "num_input_tokens_seen": 2646816, "step": 7870 }, { "epoch": 6.085780525502319, "grad_norm": 0.8340358138084412, "learning_rate": 4.391059170546966e-05, "loss": 0.5287, "num_input_tokens_seen": 2648512, "step": 7875 }, { "epoch": 6.089644513137558, "grad_norm": 1.1186096668243408, "learning_rate": 4.389955969723144e-05, "loss": 0.5892, "num_input_tokens_seen": 2650208, "step": 7880 }, { "epoch": 6.093508500772797, "grad_norm": 0.7915567755699158, "learning_rate": 4.3888519093360294e-05, "loss": 0.4108, "num_input_tokens_seen": 2651904, "step": 7885 }, { "epoch": 6.097372488408037, "grad_norm": 0.5914607048034668, "learning_rate": 4.387746989887753e-05, "loss": 0.4086, "num_input_tokens_seen": 2653824, "step": 7890 }, { "epoch": 6.101236476043277, "grad_norm": 0.6924408674240112, "learning_rate": 4.386641211880842e-05, "loss": 0.5735, "num_input_tokens_seen": 2655488, "step": 7895 }, { "epoch": 6.105100463678516, "grad_norm": 0.874354362487793, "learning_rate": 4.385534575818208e-05, "loss": 0.6882, "num_input_tokens_seen": 2657216, "step": 7900 }, { "epoch": 6.1089644513137555, "grad_norm": 2.0952601432800293, "learning_rate": 4.384427082203157e-05, "loss": 0.4742, "num_input_tokens_seen": 2658848, "step": 7905 }, { "epoch": 6.112828438948996, "grad_norm": 0.6683335304260254, "learning_rate": 4.383318731539384e-05, "loss": 0.4731, "num_input_tokens_seen": 2660800, "step": 7910 }, { "epoch": 6.116692426584235, "grad_norm": 0.601377546787262, "learning_rate": 4.3822095243309734e-05, "loss": 0.3626, "num_input_tokens_seen": 2662272, "step": 7915 }, { "epoch": 6.120556414219474, "grad_norm": 0.9442965984344482, "learning_rate": 4.381099461082399e-05, "loss": 0.4128, "num_input_tokens_seen": 2663872, "step": 7920 }, { "epoch": 6.124420401854714, "grad_norm": 0.8754228949546814, "learning_rate": 4.3799885422985234e-05, "loss": 0.4829, "num_input_tokens_seen": 2665568, "step": 7925 }, { "epoch": 6.128284389489954, "grad_norm": 0.6886698603630066, "learning_rate": 4.3788767684846e-05, "loss": 0.549, "num_input_tokens_seen": 2667296, "step": 7930 }, { "epoch": 6.132148377125193, "grad_norm": 1.361703634262085, "learning_rate": 4.377764140146271e-05, "loss": 0.7216, "num_input_tokens_seen": 2668896, "step": 7935 }, { "epoch": 6.1360123647604325, "grad_norm": 0.5367617607116699, "learning_rate": 4.3766506577895646e-05, "loss": 0.3819, "num_input_tokens_seen": 2670720, "step": 7940 }, { "epoch": 6.139876352395673, "grad_norm": 1.3781222105026245, "learning_rate": 4.375536321920901e-05, "loss": 0.6871, "num_input_tokens_seen": 2672384, "step": 7945 }, { "epoch": 6.143740340030912, "grad_norm": 1.5863187313079834, "learning_rate": 4.374421133047086e-05, "loss": 0.6954, "num_input_tokens_seen": 2674272, "step": 7950 }, { "epoch": 6.147604327666151, "grad_norm": 1.8408968448638916, "learning_rate": 4.373305091675314e-05, "loss": 0.4606, "num_input_tokens_seen": 2676064, "step": 7955 }, { "epoch": 6.151468315301391, "grad_norm": 0.82973712682724, "learning_rate": 4.3721881983131674e-05, "loss": 0.4258, "num_input_tokens_seen": 2677792, "step": 7960 }, { "epoch": 6.155332302936631, "grad_norm": 0.8505856990814209, "learning_rate": 4.3710704534686166e-05, "loss": 0.5245, "num_input_tokens_seen": 2679360, "step": 7965 }, { "epoch": 6.15919629057187, "grad_norm": 1.242977261543274, "learning_rate": 4.369951857650018e-05, "loss": 0.5871, "num_input_tokens_seen": 2681184, "step": 7970 }, { "epoch": 6.1630602782071096, "grad_norm": 0.542077898979187, "learning_rate": 4.368832411366115e-05, "loss": 0.3421, "num_input_tokens_seen": 2682944, "step": 7975 }, { "epoch": 6.166924265842349, "grad_norm": 1.490025281906128, "learning_rate": 4.36771211512604e-05, "loss": 0.5269, "num_input_tokens_seen": 2684736, "step": 7980 }, { "epoch": 6.170788253477589, "grad_norm": 0.7554579973220825, "learning_rate": 4.36659096943931e-05, "loss": 0.6061, "num_input_tokens_seen": 2686400, "step": 7985 }, { "epoch": 6.174652241112828, "grad_norm": 0.7387791872024536, "learning_rate": 4.365468974815828e-05, "loss": 0.4459, "num_input_tokens_seen": 2688224, "step": 7990 }, { "epoch": 6.178516228748068, "grad_norm": 0.9088202118873596, "learning_rate": 4.3643461317658846e-05, "loss": 0.5862, "num_input_tokens_seen": 2689952, "step": 7995 }, { "epoch": 6.182380216383308, "grad_norm": 1.3135286569595337, "learning_rate": 4.363222440800155e-05, "loss": 0.598, "num_input_tokens_seen": 2691680, "step": 8000 }, { "epoch": 6.186244204018547, "grad_norm": 0.965467095375061, "learning_rate": 4.3620979024297015e-05, "loss": 0.7199, "num_input_tokens_seen": 2693408, "step": 8005 }, { "epoch": 6.190108191653787, "grad_norm": 1.115605115890503, "learning_rate": 4.3609725171659696e-05, "loss": 0.6057, "num_input_tokens_seen": 2695168, "step": 8010 }, { "epoch": 6.193972179289026, "grad_norm": 0.7673617601394653, "learning_rate": 4.3598462855207935e-05, "loss": 0.4665, "num_input_tokens_seen": 2696768, "step": 8015 }, { "epoch": 6.197836166924266, "grad_norm": 1.2602213621139526, "learning_rate": 4.358719208006387e-05, "loss": 0.4192, "num_input_tokens_seen": 2698528, "step": 8020 }, { "epoch": 6.201700154559505, "grad_norm": 1.5456397533416748, "learning_rate": 4.357591285135354e-05, "loss": 0.4451, "num_input_tokens_seen": 2700096, "step": 8025 }, { "epoch": 6.205564142194745, "grad_norm": 0.9247782230377197, "learning_rate": 4.3564625174206794e-05, "loss": 0.372, "num_input_tokens_seen": 2701664, "step": 8030 }, { "epoch": 6.209428129829985, "grad_norm": 0.6514948010444641, "learning_rate": 4.355332905375734e-05, "loss": 0.4048, "num_input_tokens_seen": 2703200, "step": 8035 }, { "epoch": 6.213292117465224, "grad_norm": 0.954611599445343, "learning_rate": 4.354202449514273e-05, "loss": 0.4876, "num_input_tokens_seen": 2705120, "step": 8040 }, { "epoch": 6.217156105100464, "grad_norm": 0.6667215824127197, "learning_rate": 4.3530711503504326e-05, "loss": 0.511, "num_input_tokens_seen": 2706912, "step": 8045 }, { "epoch": 6.221020092735703, "grad_norm": 0.6991978287696838, "learning_rate": 4.351939008398736e-05, "loss": 0.4016, "num_input_tokens_seen": 2708544, "step": 8050 }, { "epoch": 6.224884080370943, "grad_norm": 0.9308050870895386, "learning_rate": 4.350806024174087e-05, "loss": 0.5865, "num_input_tokens_seen": 2710176, "step": 8055 }, { "epoch": 6.228748068006182, "grad_norm": 1.501216173171997, "learning_rate": 4.3496721981917744e-05, "loss": 0.5264, "num_input_tokens_seen": 2711936, "step": 8060 }, { "epoch": 6.232612055641422, "grad_norm": 0.8055127263069153, "learning_rate": 4.3485375309674683e-05, "loss": 0.5626, "num_input_tokens_seen": 2713824, "step": 8065 }, { "epoch": 6.236476043276662, "grad_norm": 0.9185643792152405, "learning_rate": 4.347402023017223e-05, "loss": 0.5546, "num_input_tokens_seen": 2715840, "step": 8070 }, { "epoch": 6.240340030911901, "grad_norm": 0.7002151012420654, "learning_rate": 4.3462656748574745e-05, "loss": 0.478, "num_input_tokens_seen": 2717472, "step": 8075 }, { "epoch": 6.244204018547141, "grad_norm": 0.8786668181419373, "learning_rate": 4.34512848700504e-05, "loss": 0.4006, "num_input_tokens_seen": 2719040, "step": 8080 }, { "epoch": 6.24806800618238, "grad_norm": 0.7903093099594116, "learning_rate": 4.34399045997712e-05, "loss": 0.5746, "num_input_tokens_seen": 2720896, "step": 8085 }, { "epoch": 6.25193199381762, "grad_norm": 0.5622165203094482, "learning_rate": 4.342851594291294e-05, "loss": 0.3931, "num_input_tokens_seen": 2722656, "step": 8090 }, { "epoch": 6.255795981452859, "grad_norm": 0.5489534139633179, "learning_rate": 4.341711890465528e-05, "loss": 0.447, "num_input_tokens_seen": 2724384, "step": 8095 }, { "epoch": 6.259659969088099, "grad_norm": 0.6324127912521362, "learning_rate": 4.3405713490181645e-05, "loss": 0.4445, "num_input_tokens_seen": 2726144, "step": 8100 }, { "epoch": 6.263523956723338, "grad_norm": 0.6438771486282349, "learning_rate": 4.339429970467928e-05, "loss": 0.4748, "num_input_tokens_seen": 2728032, "step": 8105 }, { "epoch": 6.267387944358578, "grad_norm": 0.9449020624160767, "learning_rate": 4.338287755333925e-05, "loss": 0.433, "num_input_tokens_seen": 2729664, "step": 8110 }, { "epoch": 6.271251931993818, "grad_norm": 1.0406758785247803, "learning_rate": 4.337144704135643e-05, "loss": 0.5547, "num_input_tokens_seen": 2731232, "step": 8115 }, { "epoch": 6.275115919629057, "grad_norm": 0.7452211976051331, "learning_rate": 4.3360008173929454e-05, "loss": 0.4702, "num_input_tokens_seen": 2732992, "step": 8120 }, { "epoch": 6.278979907264297, "grad_norm": 1.2101008892059326, "learning_rate": 4.3348560956260825e-05, "loss": 0.6988, "num_input_tokens_seen": 2734496, "step": 8125 }, { "epoch": 6.282843894899536, "grad_norm": 0.7511342763900757, "learning_rate": 4.333710539355678e-05, "loss": 0.378, "num_input_tokens_seen": 2736096, "step": 8130 }, { "epoch": 6.286707882534776, "grad_norm": 0.6226644515991211, "learning_rate": 4.332564149102739e-05, "loss": 0.3732, "num_input_tokens_seen": 2737792, "step": 8135 }, { "epoch": 6.290571870170015, "grad_norm": 0.7476410269737244, "learning_rate": 4.331416925388649e-05, "loss": 0.3416, "num_input_tokens_seen": 2739200, "step": 8140 }, { "epoch": 6.294435857805255, "grad_norm": 1.0023690462112427, "learning_rate": 4.330268868735174e-05, "loss": 0.4292, "num_input_tokens_seen": 2740672, "step": 8145 }, { "epoch": 6.298299845440495, "grad_norm": 0.8946877121925354, "learning_rate": 4.329119979664457e-05, "loss": 0.6122, "num_input_tokens_seen": 2742624, "step": 8150 }, { "epoch": 6.302163833075734, "grad_norm": 0.7392446994781494, "learning_rate": 4.327970258699019e-05, "loss": 0.5435, "num_input_tokens_seen": 2744224, "step": 8155 }, { "epoch": 6.306027820710974, "grad_norm": 1.1305716037750244, "learning_rate": 4.32681970636176e-05, "loss": 0.6275, "num_input_tokens_seen": 2746272, "step": 8160 }, { "epoch": 6.309891808346213, "grad_norm": 1.1653437614440918, "learning_rate": 4.3256683231759574e-05, "loss": 0.5699, "num_input_tokens_seen": 2747776, "step": 8165 }, { "epoch": 6.313755795981453, "grad_norm": 0.6789634227752686, "learning_rate": 4.3245161096652684e-05, "loss": 0.4235, "num_input_tokens_seen": 2749440, "step": 8170 }, { "epoch": 6.317619783616692, "grad_norm": 0.8033015131950378, "learning_rate": 4.323363066353727e-05, "loss": 0.3925, "num_input_tokens_seen": 2750880, "step": 8175 }, { "epoch": 6.321483771251932, "grad_norm": 0.7685956954956055, "learning_rate": 4.322209193765742e-05, "loss": 0.4706, "num_input_tokens_seen": 2752704, "step": 8180 }, { "epoch": 6.325347758887172, "grad_norm": 0.6978440880775452, "learning_rate": 4.321054492426103e-05, "loss": 0.5595, "num_input_tokens_seen": 2754464, "step": 8185 }, { "epoch": 6.329211746522411, "grad_norm": 0.5852271914482117, "learning_rate": 4.319898962859976e-05, "loss": 0.3473, "num_input_tokens_seen": 2756192, "step": 8190 }, { "epoch": 6.333075734157651, "grad_norm": 0.9988640546798706, "learning_rate": 4.3187426055929006e-05, "loss": 0.3986, "num_input_tokens_seen": 2757760, "step": 8195 }, { "epoch": 6.3369397217928904, "grad_norm": 1.1481468677520752, "learning_rate": 4.317585421150797e-05, "loss": 0.4751, "num_input_tokens_seen": 2759392, "step": 8200 }, { "epoch": 6.34080370942813, "grad_norm": 0.9718672633171082, "learning_rate": 4.316427410059959e-05, "loss": 0.8667, "num_input_tokens_seen": 2761184, "step": 8205 }, { "epoch": 6.344667697063369, "grad_norm": 0.5895628333091736, "learning_rate": 4.315268572847056e-05, "loss": 0.4566, "num_input_tokens_seen": 2762592, "step": 8210 }, { "epoch": 6.348531684698609, "grad_norm": 0.881648600101471, "learning_rate": 4.314108910039135e-05, "loss": 0.6254, "num_input_tokens_seen": 2764320, "step": 8215 }, { "epoch": 6.352395672333849, "grad_norm": 0.6713352799415588, "learning_rate": 4.3129484221636176e-05, "loss": 0.3978, "num_input_tokens_seen": 2766016, "step": 8220 }, { "epoch": 6.356259659969088, "grad_norm": 1.15146803855896, "learning_rate": 4.3117871097483e-05, "loss": 0.4959, "num_input_tokens_seen": 2767744, "step": 8225 }, { "epoch": 6.360123647604327, "grad_norm": 0.8171817064285278, "learning_rate": 4.310624973321355e-05, "loss": 0.4684, "num_input_tokens_seen": 2769216, "step": 8230 }, { "epoch": 6.3639876352395675, "grad_norm": 0.8079126477241516, "learning_rate": 4.309462013411328e-05, "loss": 0.5165, "num_input_tokens_seen": 2770752, "step": 8235 }, { "epoch": 6.367851622874807, "grad_norm": 1.0289394855499268, "learning_rate": 4.308298230547142e-05, "loss": 0.4693, "num_input_tokens_seen": 2772448, "step": 8240 }, { "epoch": 6.371715610510046, "grad_norm": 0.6132727861404419, "learning_rate": 4.307133625258091e-05, "loss": 0.4929, "num_input_tokens_seen": 2774304, "step": 8245 }, { "epoch": 6.375579598145286, "grad_norm": 1.0193275213241577, "learning_rate": 4.3059681980738445e-05, "loss": 0.4335, "num_input_tokens_seen": 2775840, "step": 8250 }, { "epoch": 6.379443585780526, "grad_norm": 1.0462247133255005, "learning_rate": 4.304801949524446e-05, "loss": 0.4406, "num_input_tokens_seen": 2777568, "step": 8255 }, { "epoch": 6.383307573415765, "grad_norm": 0.948837161064148, "learning_rate": 4.303634880140312e-05, "loss": 0.908, "num_input_tokens_seen": 2779360, "step": 8260 }, { "epoch": 6.387171561051004, "grad_norm": 0.800899863243103, "learning_rate": 4.302466990452233e-05, "loss": 0.6192, "num_input_tokens_seen": 2781216, "step": 8265 }, { "epoch": 6.3910355486862445, "grad_norm": 0.4937194883823395, "learning_rate": 4.301298280991373e-05, "loss": 0.4116, "num_input_tokens_seen": 2782944, "step": 8270 }, { "epoch": 6.394899536321484, "grad_norm": 0.8246301412582397, "learning_rate": 4.3001287522892665e-05, "loss": 0.5689, "num_input_tokens_seen": 2784448, "step": 8275 }, { "epoch": 6.398763523956723, "grad_norm": 0.6517672538757324, "learning_rate": 4.298958404877823e-05, "loss": 0.5891, "num_input_tokens_seen": 2786240, "step": 8280 }, { "epoch": 6.402627511591963, "grad_norm": 1.0301368236541748, "learning_rate": 4.2977872392893235e-05, "loss": 0.396, "num_input_tokens_seen": 2787712, "step": 8285 }, { "epoch": 6.406491499227203, "grad_norm": 0.7370407581329346, "learning_rate": 4.296615256056421e-05, "loss": 0.4804, "num_input_tokens_seen": 2789184, "step": 8290 }, { "epoch": 6.410355486862442, "grad_norm": 0.5161345601081848, "learning_rate": 4.295442455712141e-05, "loss": 0.3991, "num_input_tokens_seen": 2790752, "step": 8295 }, { "epoch": 6.414219474497681, "grad_norm": 0.7285977602005005, "learning_rate": 4.294268838789879e-05, "loss": 0.4374, "num_input_tokens_seen": 2792288, "step": 8300 }, { "epoch": 6.4180834621329215, "grad_norm": 0.6409057378768921, "learning_rate": 4.293094405823404e-05, "loss": 0.3777, "num_input_tokens_seen": 2793792, "step": 8305 }, { "epoch": 6.421947449768161, "grad_norm": 0.8522518873214722, "learning_rate": 4.2919191573468555e-05, "loss": 0.4403, "num_input_tokens_seen": 2795232, "step": 8310 }, { "epoch": 6.4258114374034, "grad_norm": 1.223638892173767, "learning_rate": 4.290743093894742e-05, "loss": 0.5378, "num_input_tokens_seen": 2796896, "step": 8315 }, { "epoch": 6.42967542503864, "grad_norm": 0.891741156578064, "learning_rate": 4.2895662160019444e-05, "loss": 0.5107, "num_input_tokens_seen": 2798464, "step": 8320 }, { "epoch": 6.43353941267388, "grad_norm": 0.5769285559654236, "learning_rate": 4.288388524203716e-05, "loss": 0.6113, "num_input_tokens_seen": 2800032, "step": 8325 }, { "epoch": 6.437403400309119, "grad_norm": 1.259589672088623, "learning_rate": 4.2872100190356756e-05, "loss": 0.4245, "num_input_tokens_seen": 2801568, "step": 8330 }, { "epoch": 6.441267387944358, "grad_norm": 0.5325093269348145, "learning_rate": 4.286030701033815e-05, "loss": 0.3875, "num_input_tokens_seen": 2803008, "step": 8335 }, { "epoch": 6.4451313755795985, "grad_norm": 1.0360701084136963, "learning_rate": 4.2848505707344965e-05, "loss": 0.3642, "num_input_tokens_seen": 2804480, "step": 8340 }, { "epoch": 6.448995363214838, "grad_norm": 1.1037391424179077, "learning_rate": 4.283669628674449e-05, "loss": 0.4608, "num_input_tokens_seen": 2806016, "step": 8345 }, { "epoch": 6.452859350850077, "grad_norm": 1.0436878204345703, "learning_rate": 4.282487875390772e-05, "loss": 0.4986, "num_input_tokens_seen": 2807616, "step": 8350 }, { "epoch": 6.456723338485316, "grad_norm": 0.8969153165817261, "learning_rate": 4.2813053114209345e-05, "loss": 0.4492, "num_input_tokens_seen": 2809248, "step": 8355 }, { "epoch": 6.460587326120557, "grad_norm": 0.925747811794281, "learning_rate": 4.280121937302774e-05, "loss": 0.4525, "num_input_tokens_seen": 2810848, "step": 8360 }, { "epoch": 6.464451313755796, "grad_norm": 0.5573831796646118, "learning_rate": 4.2789377535744955e-05, "loss": 0.3711, "num_input_tokens_seen": 2812512, "step": 8365 }, { "epoch": 6.468315301391035, "grad_norm": 0.8851404190063477, "learning_rate": 4.2777527607746725e-05, "loss": 0.4329, "num_input_tokens_seen": 2814016, "step": 8370 }, { "epoch": 6.4721792890262755, "grad_norm": 1.0902279615402222, "learning_rate": 4.2765669594422486e-05, "loss": 0.419, "num_input_tokens_seen": 2815552, "step": 8375 }, { "epoch": 6.476043276661515, "grad_norm": 1.1983774900436401, "learning_rate": 4.2753803501165304e-05, "loss": 0.4712, "num_input_tokens_seen": 2817216, "step": 8380 }, { "epoch": 6.479907264296754, "grad_norm": 0.6846383213996887, "learning_rate": 4.2741929333371986e-05, "loss": 0.3969, "num_input_tokens_seen": 2819040, "step": 8385 }, { "epoch": 6.483771251931993, "grad_norm": 0.8463010787963867, "learning_rate": 4.2730047096442935e-05, "loss": 0.4027, "num_input_tokens_seen": 2820800, "step": 8390 }, { "epoch": 6.487635239567234, "grad_norm": 1.1159881353378296, "learning_rate": 4.271815679578229e-05, "loss": 0.4667, "num_input_tokens_seen": 2822592, "step": 8395 }, { "epoch": 6.491499227202473, "grad_norm": 1.0065104961395264, "learning_rate": 4.270625843679783e-05, "loss": 0.9092, "num_input_tokens_seen": 2824288, "step": 8400 }, { "epoch": 6.495363214837712, "grad_norm": 0.532608151435852, "learning_rate": 4.2694352024901e-05, "loss": 0.6857, "num_input_tokens_seen": 2825952, "step": 8405 }, { "epoch": 6.4992272024729525, "grad_norm": 0.5927087664604187, "learning_rate": 4.268243756550689e-05, "loss": 0.343, "num_input_tokens_seen": 2827392, "step": 8410 }, { "epoch": 6.503091190108192, "grad_norm": 0.8389599323272705, "learning_rate": 4.267051506403428e-05, "loss": 0.3831, "num_input_tokens_seen": 2828992, "step": 8415 }, { "epoch": 6.506955177743431, "grad_norm": 0.5219050049781799, "learning_rate": 4.26585845259056e-05, "loss": 0.3724, "num_input_tokens_seen": 2830720, "step": 8420 }, { "epoch": 6.5108191653786704, "grad_norm": 0.8296083211898804, "learning_rate": 4.264664595654692e-05, "loss": 0.452, "num_input_tokens_seen": 2832416, "step": 8425 }, { "epoch": 6.514683153013911, "grad_norm": 0.6716732382774353, "learning_rate": 4.263469936138797e-05, "loss": 0.5577, "num_input_tokens_seen": 2834080, "step": 8430 }, { "epoch": 6.51854714064915, "grad_norm": 1.4134224653244019, "learning_rate": 4.2622744745862154e-05, "loss": 0.7568, "num_input_tokens_seen": 2835680, "step": 8435 }, { "epoch": 6.522411128284389, "grad_norm": 0.6943607330322266, "learning_rate": 4.2610782115406483e-05, "loss": 0.4187, "num_input_tokens_seen": 2837216, "step": 8440 }, { "epoch": 6.5262751159196295, "grad_norm": 0.9872038960456848, "learning_rate": 4.259881147546164e-05, "loss": 0.744, "num_input_tokens_seen": 2839104, "step": 8445 }, { "epoch": 6.530139103554869, "grad_norm": 1.3180166482925415, "learning_rate": 4.258683283147195e-05, "loss": 0.4385, "num_input_tokens_seen": 2841088, "step": 8450 }, { "epoch": 6.534003091190108, "grad_norm": 1.8412511348724365, "learning_rate": 4.2574846188885356e-05, "loss": 0.62, "num_input_tokens_seen": 2842944, "step": 8455 }, { "epoch": 6.5378670788253475, "grad_norm": 0.6463793516159058, "learning_rate": 4.256285155315346e-05, "loss": 0.4642, "num_input_tokens_seen": 2844480, "step": 8460 }, { "epoch": 6.541731066460588, "grad_norm": 0.8249832987785339, "learning_rate": 4.25508489297315e-05, "loss": 0.4044, "num_input_tokens_seen": 2846112, "step": 8465 }, { "epoch": 6.545595054095827, "grad_norm": 0.6139340400695801, "learning_rate": 4.253883832407835e-05, "loss": 0.5331, "num_input_tokens_seen": 2847936, "step": 8470 }, { "epoch": 6.549459041731066, "grad_norm": 0.6586691737174988, "learning_rate": 4.2526819741656485e-05, "loss": 0.3714, "num_input_tokens_seen": 2849376, "step": 8475 }, { "epoch": 6.553323029366306, "grad_norm": 0.7538725137710571, "learning_rate": 4.2514793187932036e-05, "loss": 0.4892, "num_input_tokens_seen": 2851040, "step": 8480 }, { "epoch": 6.557187017001546, "grad_norm": 0.6587100625038147, "learning_rate": 4.250275866837475e-05, "loss": 0.4766, "num_input_tokens_seen": 2852736, "step": 8485 }, { "epoch": 6.561051004636785, "grad_norm": 0.41870737075805664, "learning_rate": 4.2490716188458014e-05, "loss": 0.3436, "num_input_tokens_seen": 2854400, "step": 8490 }, { "epoch": 6.5649149922720245, "grad_norm": 0.8886138796806335, "learning_rate": 4.2478665753658794e-05, "loss": 0.5863, "num_input_tokens_seen": 2856096, "step": 8495 }, { "epoch": 6.568778979907265, "grad_norm": 1.2264467477798462, "learning_rate": 4.246660736945773e-05, "loss": 0.3565, "num_input_tokens_seen": 2857664, "step": 8500 }, { "epoch": 6.572642967542504, "grad_norm": 0.6897282004356384, "learning_rate": 4.2454541041339027e-05, "loss": 0.4296, "num_input_tokens_seen": 2859456, "step": 8505 }, { "epoch": 6.576506955177743, "grad_norm": 1.350389003753662, "learning_rate": 4.2442466774790516e-05, "loss": 0.4734, "num_input_tokens_seen": 2861216, "step": 8510 }, { "epoch": 6.580370942812983, "grad_norm": 1.1808985471725464, "learning_rate": 4.243038457530366e-05, "loss": 0.5784, "num_input_tokens_seen": 2862976, "step": 8515 }, { "epoch": 6.584234930448223, "grad_norm": 0.643854022026062, "learning_rate": 4.241829444837352e-05, "loss": 0.3486, "num_input_tokens_seen": 2864576, "step": 8520 }, { "epoch": 6.588098918083462, "grad_norm": 0.8017864227294922, "learning_rate": 4.240619639949874e-05, "loss": 0.3874, "num_input_tokens_seen": 2866112, "step": 8525 }, { "epoch": 6.5919629057187015, "grad_norm": 0.6235646605491638, "learning_rate": 4.239409043418161e-05, "loss": 0.4386, "num_input_tokens_seen": 2867840, "step": 8530 }, { "epoch": 6.595826893353941, "grad_norm": 0.663729190826416, "learning_rate": 4.2381976557927974e-05, "loss": 0.4179, "num_input_tokens_seen": 2869440, "step": 8535 }, { "epoch": 6.599690880989181, "grad_norm": 0.5231285095214844, "learning_rate": 4.2369854776247295e-05, "loss": 0.5553, "num_input_tokens_seen": 2871296, "step": 8540 }, { "epoch": 6.60355486862442, "grad_norm": 0.9751484990119934, "learning_rate": 4.235772509465266e-05, "loss": 0.4729, "num_input_tokens_seen": 2873088, "step": 8545 }, { "epoch": 6.60741885625966, "grad_norm": 0.8297590017318726, "learning_rate": 4.234558751866068e-05, "loss": 0.5202, "num_input_tokens_seen": 2874976, "step": 8550 }, { "epoch": 6.6112828438949, "grad_norm": 0.8475916981697083, "learning_rate": 4.2333442053791625e-05, "loss": 0.4553, "num_input_tokens_seen": 2876672, "step": 8555 }, { "epoch": 6.615146831530139, "grad_norm": 1.40280282497406, "learning_rate": 4.2321288705569315e-05, "loss": 0.6909, "num_input_tokens_seen": 2878176, "step": 8560 }, { "epoch": 6.6190108191653785, "grad_norm": 0.9863243699073792, "learning_rate": 4.230912747952118e-05, "loss": 0.5775, "num_input_tokens_seen": 2879904, "step": 8565 }, { "epoch": 6.622874806800619, "grad_norm": 1.1192681789398193, "learning_rate": 4.22969583811782e-05, "loss": 0.4574, "num_input_tokens_seen": 2881568, "step": 8570 }, { "epoch": 6.626738794435858, "grad_norm": 0.7079848647117615, "learning_rate": 4.228478141607496e-05, "loss": 0.3835, "num_input_tokens_seen": 2883296, "step": 8575 }, { "epoch": 6.630602782071097, "grad_norm": 0.6072793006896973, "learning_rate": 4.227259658974961e-05, "loss": 0.4536, "num_input_tokens_seen": 2884896, "step": 8580 }, { "epoch": 6.634466769706337, "grad_norm": 1.699386715888977, "learning_rate": 4.2260403907743906e-05, "loss": 0.4658, "num_input_tokens_seen": 2886528, "step": 8585 }, { "epoch": 6.638330757341577, "grad_norm": 0.6360980272293091, "learning_rate": 4.224820337560313e-05, "loss": 0.4955, "num_input_tokens_seen": 2888288, "step": 8590 }, { "epoch": 6.642194744976816, "grad_norm": 1.0080368518829346, "learning_rate": 4.2235994998876156e-05, "loss": 0.4719, "num_input_tokens_seen": 2889888, "step": 8595 }, { "epoch": 6.6460587326120555, "grad_norm": 0.675884485244751, "learning_rate": 4.222377878311544e-05, "loss": 0.6753, "num_input_tokens_seen": 2891904, "step": 8600 }, { "epoch": 6.649922720247295, "grad_norm": 0.6530839800834656, "learning_rate": 4.2211554733876984e-05, "loss": 0.4301, "num_input_tokens_seen": 2893664, "step": 8605 }, { "epoch": 6.653786707882535, "grad_norm": 1.0433281660079956, "learning_rate": 4.2199322856720356e-05, "loss": 0.4519, "num_input_tokens_seen": 2895488, "step": 8610 }, { "epoch": 6.657650695517774, "grad_norm": 0.8296135067939758, "learning_rate": 4.218708315720869e-05, "loss": 0.4162, "num_input_tokens_seen": 2897280, "step": 8615 }, { "epoch": 6.661514683153014, "grad_norm": 0.6767705678939819, "learning_rate": 4.217483564090868e-05, "loss": 0.4876, "num_input_tokens_seen": 2898944, "step": 8620 }, { "epoch": 6.665378670788254, "grad_norm": 0.6037458181381226, "learning_rate": 4.216258031339056e-05, "loss": 0.5308, "num_input_tokens_seen": 2900544, "step": 8625 }, { "epoch": 6.669242658423493, "grad_norm": 0.8032059073448181, "learning_rate": 4.2150317180228135e-05, "loss": 0.4397, "num_input_tokens_seen": 2902048, "step": 8630 }, { "epoch": 6.6731066460587325, "grad_norm": 0.8339233994483948, "learning_rate": 4.2138046246998746e-05, "loss": 0.5138, "num_input_tokens_seen": 2903584, "step": 8635 }, { "epoch": 6.676970633693972, "grad_norm": 0.8203127980232239, "learning_rate": 4.2125767519283285e-05, "loss": 0.5014, "num_input_tokens_seen": 2905024, "step": 8640 }, { "epoch": 6.680834621329212, "grad_norm": 0.679129958152771, "learning_rate": 4.21134810026662e-05, "loss": 0.7371, "num_input_tokens_seen": 2906720, "step": 8645 }, { "epoch": 6.684698608964451, "grad_norm": 0.8255099654197693, "learning_rate": 4.210118670273546e-05, "loss": 0.4701, "num_input_tokens_seen": 2908224, "step": 8650 }, { "epoch": 6.688562596599691, "grad_norm": 0.7212280035018921, "learning_rate": 4.20888846250826e-05, "loss": 0.4469, "num_input_tokens_seen": 2910272, "step": 8655 }, { "epoch": 6.69242658423493, "grad_norm": 0.880824089050293, "learning_rate": 4.2076574775302665e-05, "loss": 0.4659, "num_input_tokens_seen": 2911808, "step": 8660 }, { "epoch": 6.69629057187017, "grad_norm": 0.6419463157653809, "learning_rate": 4.206425715899425e-05, "loss": 0.3997, "num_input_tokens_seen": 2913600, "step": 8665 }, { "epoch": 6.7001545595054095, "grad_norm": 0.9080177545547485, "learning_rate": 4.205193178175949e-05, "loss": 0.4867, "num_input_tokens_seen": 2915296, "step": 8670 }, { "epoch": 6.704018547140649, "grad_norm": 0.9353747367858887, "learning_rate": 4.203959864920404e-05, "loss": 0.4439, "num_input_tokens_seen": 2917120, "step": 8675 }, { "epoch": 6.707882534775889, "grad_norm": 0.6356704235076904, "learning_rate": 4.202725776693707e-05, "loss": 0.5709, "num_input_tokens_seen": 2918816, "step": 8680 }, { "epoch": 6.711746522411128, "grad_norm": 0.8049728274345398, "learning_rate": 4.2014909140571305e-05, "loss": 0.48, "num_input_tokens_seen": 2920512, "step": 8685 }, { "epoch": 6.715610510046368, "grad_norm": 0.6948986649513245, "learning_rate": 4.2002552775722956e-05, "loss": 0.731, "num_input_tokens_seen": 2922304, "step": 8690 }, { "epoch": 6.719474497681608, "grad_norm": 1.0989042520523071, "learning_rate": 4.199018867801179e-05, "loss": 0.6692, "num_input_tokens_seen": 2924064, "step": 8695 }, { "epoch": 6.723338485316847, "grad_norm": 0.6811407804489136, "learning_rate": 4.197781685306105e-05, "loss": 0.3817, "num_input_tokens_seen": 2925728, "step": 8700 }, { "epoch": 6.7272024729520865, "grad_norm": 1.2502433061599731, "learning_rate": 4.196543730649754e-05, "loss": 0.4007, "num_input_tokens_seen": 2927232, "step": 8705 }, { "epoch": 6.731066460587326, "grad_norm": 0.8550954461097717, "learning_rate": 4.1953050043951537e-05, "loss": 0.5381, "num_input_tokens_seen": 2928800, "step": 8710 }, { "epoch": 6.734930448222566, "grad_norm": 0.6955838799476624, "learning_rate": 4.194065507105685e-05, "loss": 0.43, "num_input_tokens_seen": 2930496, "step": 8715 }, { "epoch": 6.738794435857805, "grad_norm": 0.8602291941642761, "learning_rate": 4.192825239345077e-05, "loss": 0.5025, "num_input_tokens_seen": 2932192, "step": 8720 }, { "epoch": 6.742658423493045, "grad_norm": 0.7912707924842834, "learning_rate": 4.191584201677414e-05, "loss": 0.444, "num_input_tokens_seen": 2933728, "step": 8725 }, { "epoch": 6.746522411128284, "grad_norm": 0.6466251015663147, "learning_rate": 4.190342394667124e-05, "loss": 0.421, "num_input_tokens_seen": 2935456, "step": 8730 }, { "epoch": 6.750386398763524, "grad_norm": 0.7399994134902954, "learning_rate": 4.189099818878991e-05, "loss": 0.511, "num_input_tokens_seen": 2937216, "step": 8735 }, { "epoch": 6.7542503863987635, "grad_norm": 0.9097562432289124, "learning_rate": 4.1878564748781446e-05, "loss": 0.3854, "num_input_tokens_seen": 2938944, "step": 8740 }, { "epoch": 6.758114374034003, "grad_norm": 1.335138201713562, "learning_rate": 4.186612363230065e-05, "loss": 0.4676, "num_input_tokens_seen": 2940576, "step": 8745 }, { "epoch": 6.761978361669243, "grad_norm": 0.6712969541549683, "learning_rate": 4.185367484500582e-05, "loss": 0.3961, "num_input_tokens_seen": 2942176, "step": 8750 }, { "epoch": 6.765842349304482, "grad_norm": 1.1958693265914917, "learning_rate": 4.184121839255873e-05, "loss": 0.4395, "num_input_tokens_seen": 2943936, "step": 8755 }, { "epoch": 6.769706336939722, "grad_norm": 0.7162715196609497, "learning_rate": 4.182875428062467e-05, "loss": 0.3689, "num_input_tokens_seen": 2945472, "step": 8760 }, { "epoch": 6.773570324574961, "grad_norm": 0.7525103092193604, "learning_rate": 4.181628251487237e-05, "loss": 0.3454, "num_input_tokens_seen": 2946848, "step": 8765 }, { "epoch": 6.777434312210201, "grad_norm": 0.8107184171676636, "learning_rate": 4.1803803100974075e-05, "loss": 0.3935, "num_input_tokens_seen": 2948512, "step": 8770 }, { "epoch": 6.7812982998454405, "grad_norm": 0.7343987822532654, "learning_rate": 4.17913160446055e-05, "loss": 0.3569, "num_input_tokens_seen": 2949952, "step": 8775 }, { "epoch": 6.78516228748068, "grad_norm": 0.8986130356788635, "learning_rate": 4.177882135144582e-05, "loss": 0.4123, "num_input_tokens_seen": 2951872, "step": 8780 }, { "epoch": 6.789026275115919, "grad_norm": 0.6082340478897095, "learning_rate": 4.1766319027177715e-05, "loss": 0.7388, "num_input_tokens_seen": 2954016, "step": 8785 }, { "epoch": 6.792890262751159, "grad_norm": 1.07297682762146, "learning_rate": 4.1753809077487304e-05, "loss": 0.5204, "num_input_tokens_seen": 2955712, "step": 8790 }, { "epoch": 6.796754250386399, "grad_norm": 0.8461518287658691, "learning_rate": 4.174129150806419e-05, "loss": 0.3859, "num_input_tokens_seen": 2957152, "step": 8795 }, { "epoch": 6.800618238021638, "grad_norm": 1.1750448942184448, "learning_rate": 4.172876632460143e-05, "loss": 0.9436, "num_input_tokens_seen": 2958880, "step": 8800 }, { "epoch": 6.804482225656878, "grad_norm": 1.2734897136688232, "learning_rate": 4.1716233532795564e-05, "loss": 0.7641, "num_input_tokens_seen": 2960448, "step": 8805 }, { "epoch": 6.8083462132921175, "grad_norm": 0.8206815123558044, "learning_rate": 4.170369313834659e-05, "loss": 0.4009, "num_input_tokens_seen": 2962176, "step": 8810 }, { "epoch": 6.812210200927357, "grad_norm": 1.4046002626419067, "learning_rate": 4.1691145146957934e-05, "loss": 0.5692, "num_input_tokens_seen": 2963872, "step": 8815 }, { "epoch": 6.816074188562597, "grad_norm": 0.5591655373573303, "learning_rate": 4.16785895643365e-05, "loss": 0.4671, "num_input_tokens_seen": 2965600, "step": 8820 }, { "epoch": 6.819938176197836, "grad_norm": 1.410979986190796, "learning_rate": 4.1666026396192656e-05, "loss": 0.4288, "num_input_tokens_seen": 2967424, "step": 8825 }, { "epoch": 6.823802163833076, "grad_norm": 1.644108533859253, "learning_rate": 4.16534556482402e-05, "loss": 0.6636, "num_input_tokens_seen": 2969120, "step": 8830 }, { "epoch": 6.827666151468315, "grad_norm": 1.6204893589019775, "learning_rate": 4.164087732619637e-05, "loss": 0.5457, "num_input_tokens_seen": 2970816, "step": 8835 }, { "epoch": 6.831530139103555, "grad_norm": 0.7978081703186035, "learning_rate": 4.162829143578189e-05, "loss": 0.5923, "num_input_tokens_seen": 2972576, "step": 8840 }, { "epoch": 6.8353941267387945, "grad_norm": 1.0850142240524292, "learning_rate": 4.161569798272087e-05, "loss": 0.5706, "num_input_tokens_seen": 2974304, "step": 8845 }, { "epoch": 6.839258114374034, "grad_norm": 0.9883841276168823, "learning_rate": 4.16030969727409e-05, "loss": 0.4758, "num_input_tokens_seen": 2975936, "step": 8850 }, { "epoch": 6.843122102009273, "grad_norm": 0.8513297438621521, "learning_rate": 4.1590488411573006e-05, "loss": 0.6544, "num_input_tokens_seen": 2977536, "step": 8855 }, { "epoch": 6.846986089644513, "grad_norm": 0.9638069868087769, "learning_rate": 4.157787230495161e-05, "loss": 0.463, "num_input_tokens_seen": 2979040, "step": 8860 }, { "epoch": 6.850850077279753, "grad_norm": 0.7101118564605713, "learning_rate": 4.156524865861462e-05, "loss": 0.3804, "num_input_tokens_seen": 2980704, "step": 8865 }, { "epoch": 6.854714064914992, "grad_norm": 0.5546497106552124, "learning_rate": 4.155261747830332e-05, "loss": 0.4775, "num_input_tokens_seen": 2982400, "step": 8870 }, { "epoch": 6.858578052550232, "grad_norm": 0.751453161239624, "learning_rate": 4.153997876976248e-05, "loss": 0.3605, "num_input_tokens_seen": 2983968, "step": 8875 }, { "epoch": 6.8624420401854715, "grad_norm": 1.5033479928970337, "learning_rate": 4.152733253874023e-05, "loss": 0.6667, "num_input_tokens_seen": 2985408, "step": 8880 }, { "epoch": 6.866306027820711, "grad_norm": 0.9348111748695374, "learning_rate": 4.151467879098817e-05, "loss": 0.392, "num_input_tokens_seen": 2987264, "step": 8885 }, { "epoch": 6.87017001545595, "grad_norm": 1.1851354837417603, "learning_rate": 4.150201753226129e-05, "loss": 0.4683, "num_input_tokens_seen": 2988896, "step": 8890 }, { "epoch": 6.87403400309119, "grad_norm": 0.6537526249885559, "learning_rate": 4.148934876831801e-05, "loss": 0.3915, "num_input_tokens_seen": 2990592, "step": 8895 }, { "epoch": 6.87789799072643, "grad_norm": 0.5727382302284241, "learning_rate": 4.1476672504920154e-05, "loss": 0.6175, "num_input_tokens_seen": 2992064, "step": 8900 }, { "epoch": 6.881761978361669, "grad_norm": 2.2557456493377686, "learning_rate": 4.146398874783297e-05, "loss": 0.5011, "num_input_tokens_seen": 2993824, "step": 8905 }, { "epoch": 6.885625965996908, "grad_norm": 0.8918333649635315, "learning_rate": 4.1451297502825116e-05, "loss": 0.3844, "num_input_tokens_seen": 2995520, "step": 8910 }, { "epoch": 6.8894899536321486, "grad_norm": 0.8950200080871582, "learning_rate": 4.143859877566863e-05, "loss": 0.4385, "num_input_tokens_seen": 2997472, "step": 8915 }, { "epoch": 6.893353941267388, "grad_norm": 1.2720222473144531, "learning_rate": 4.1425892572138966e-05, "loss": 0.4211, "num_input_tokens_seen": 2999008, "step": 8920 }, { "epoch": 6.897217928902627, "grad_norm": 0.8748270869255066, "learning_rate": 4.141317889801499e-05, "loss": 0.4442, "num_input_tokens_seen": 3000768, "step": 8925 }, { "epoch": 6.901081916537867, "grad_norm": 1.347531795501709, "learning_rate": 4.140045775907896e-05, "loss": 0.3619, "num_input_tokens_seen": 3002336, "step": 8930 }, { "epoch": 6.904945904173107, "grad_norm": 0.9970714449882507, "learning_rate": 4.138772916111653e-05, "loss": 0.3712, "num_input_tokens_seen": 3003744, "step": 8935 }, { "epoch": 6.908809891808346, "grad_norm": 0.9361785650253296, "learning_rate": 4.137499310991672e-05, "loss": 0.4777, "num_input_tokens_seen": 3005632, "step": 8940 }, { "epoch": 6.912673879443586, "grad_norm": 0.6350540518760681, "learning_rate": 4.136224961127199e-05, "loss": 0.4477, "num_input_tokens_seen": 3007456, "step": 8945 }, { "epoch": 6.916537867078826, "grad_norm": 0.8053473234176636, "learning_rate": 4.1349498670978134e-05, "loss": 0.7928, "num_input_tokens_seen": 3009408, "step": 8950 }, { "epoch": 6.920401854714065, "grad_norm": 0.7578513026237488, "learning_rate": 4.1336740294834384e-05, "loss": 0.3473, "num_input_tokens_seen": 3011232, "step": 8955 }, { "epoch": 6.924265842349304, "grad_norm": 1.234535813331604, "learning_rate": 4.13239744886433e-05, "loss": 0.3647, "num_input_tokens_seen": 3012896, "step": 8960 }, { "epoch": 6.928129829984544, "grad_norm": 1.7247563600540161, "learning_rate": 4.1311201258210867e-05, "loss": 0.4532, "num_input_tokens_seen": 3014400, "step": 8965 }, { "epoch": 6.931993817619784, "grad_norm": 0.9577203392982483, "learning_rate": 4.1298420609346414e-05, "loss": 0.5516, "num_input_tokens_seen": 3016416, "step": 8970 }, { "epoch": 6.935857805255023, "grad_norm": 1.2689354419708252, "learning_rate": 4.128563254786266e-05, "loss": 0.4531, "num_input_tokens_seen": 3018336, "step": 8975 }, { "epoch": 6.939721792890262, "grad_norm": 0.6392449140548706, "learning_rate": 4.1272837079575686e-05, "loss": 0.4191, "num_input_tokens_seen": 3020192, "step": 8980 }, { "epoch": 6.943585780525503, "grad_norm": 0.6885069608688354, "learning_rate": 4.1260034210304966e-05, "loss": 0.6636, "num_input_tokens_seen": 3021728, "step": 8985 }, { "epoch": 6.947449768160742, "grad_norm": 0.8833767771720886, "learning_rate": 4.1247223945873307e-05, "loss": 0.5026, "num_input_tokens_seen": 3023712, "step": 8990 }, { "epoch": 6.951313755795981, "grad_norm": 0.6209324598312378, "learning_rate": 4.123440629210689e-05, "loss": 0.5324, "num_input_tokens_seen": 3025280, "step": 8995 }, { "epoch": 6.955177743431221, "grad_norm": 1.1749900579452515, "learning_rate": 4.1221581254835276e-05, "loss": 0.5075, "num_input_tokens_seen": 3027040, "step": 9000 }, { "epoch": 6.959041731066461, "grad_norm": 1.0103297233581543, "learning_rate": 4.120874883989135e-05, "loss": 0.4436, "num_input_tokens_seen": 3028640, "step": 9005 }, { "epoch": 6.9629057187017, "grad_norm": 1.1096343994140625, "learning_rate": 4.1195909053111386e-05, "loss": 0.513, "num_input_tokens_seen": 3030560, "step": 9010 }, { "epoch": 6.966769706336939, "grad_norm": 0.7379075884819031, "learning_rate": 4.1183061900335e-05, "loss": 0.468, "num_input_tokens_seen": 3032064, "step": 9015 }, { "epoch": 6.97063369397218, "grad_norm": 0.7314010858535767, "learning_rate": 4.117020738740512e-05, "loss": 0.717, "num_input_tokens_seen": 3033664, "step": 9020 }, { "epoch": 6.974497681607419, "grad_norm": 0.7025278806686401, "learning_rate": 4.1157345520168106e-05, "loss": 0.5295, "num_input_tokens_seen": 3035456, "step": 9025 }, { "epoch": 6.978361669242658, "grad_norm": 0.8349624276161194, "learning_rate": 4.1144476304473575e-05, "loss": 0.4641, "num_input_tokens_seen": 3036992, "step": 9030 }, { "epoch": 6.9822256568778975, "grad_norm": 1.4349279403686523, "learning_rate": 4.113159974617454e-05, "loss": 0.5919, "num_input_tokens_seen": 3039040, "step": 9035 }, { "epoch": 6.986089644513138, "grad_norm": 0.5905850529670715, "learning_rate": 4.111871585112733e-05, "loss": 0.3688, "num_input_tokens_seen": 3041056, "step": 9040 }, { "epoch": 6.989953632148377, "grad_norm": 0.8184460401535034, "learning_rate": 4.1105824625191624e-05, "loss": 0.4998, "num_input_tokens_seen": 3042784, "step": 9045 }, { "epoch": 6.993817619783616, "grad_norm": 0.7203229069709778, "learning_rate": 4.109292607423042e-05, "loss": 0.366, "num_input_tokens_seen": 3044512, "step": 9050 }, { "epoch": 6.997681607418857, "grad_norm": 0.5740252733230591, "learning_rate": 4.108002020411006e-05, "loss": 0.4161, "num_input_tokens_seen": 3046304, "step": 9055 }, { "epoch": 7.0, "eval_loss": 0.49160444736480713, "eval_runtime": 6.3704, "eval_samples_per_second": 90.261, "eval_steps_per_second": 22.605, "num_input_tokens_seen": 3047200, "step": 9058 }, { "epoch": 7.001545595054096, "grad_norm": 0.7665523290634155, "learning_rate": 4.1067107020700216e-05, "loss": 0.3924, "num_input_tokens_seen": 3047968, "step": 9060 }, { "epoch": 7.005409582689335, "grad_norm": 0.6078484654426575, "learning_rate": 4.105418652987387e-05, "loss": 0.4034, "num_input_tokens_seen": 3049600, "step": 9065 }, { "epoch": 7.0092735703245745, "grad_norm": 0.8208014965057373, "learning_rate": 4.1041258737507347e-05, "loss": 0.3858, "num_input_tokens_seen": 3051328, "step": 9070 }, { "epoch": 7.013137557959815, "grad_norm": 1.4954732656478882, "learning_rate": 4.102832364948029e-05, "loss": 0.4902, "num_input_tokens_seen": 3053088, "step": 9075 }, { "epoch": 7.017001545595054, "grad_norm": 0.7987421154975891, "learning_rate": 4.101538127167564e-05, "loss": 0.5604, "num_input_tokens_seen": 3054848, "step": 9080 }, { "epoch": 7.020865533230293, "grad_norm": 0.9555924534797668, "learning_rate": 4.100243160997968e-05, "loss": 0.4761, "num_input_tokens_seen": 3056608, "step": 9085 }, { "epoch": 7.024729520865534, "grad_norm": 0.6967408657073975, "learning_rate": 4.0989474670281986e-05, "loss": 0.4389, "num_input_tokens_seen": 3058016, "step": 9090 }, { "epoch": 7.028593508500773, "grad_norm": 1.1020830869674683, "learning_rate": 4.097651045847546e-05, "loss": 0.4846, "num_input_tokens_seen": 3059424, "step": 9095 }, { "epoch": 7.032457496136012, "grad_norm": 1.664932370185852, "learning_rate": 4.096353898045628e-05, "loss": 0.3859, "num_input_tokens_seen": 3061024, "step": 9100 }, { "epoch": 7.0363214837712516, "grad_norm": 2.2532689571380615, "learning_rate": 4.095056024212399e-05, "loss": 0.3839, "num_input_tokens_seen": 3062496, "step": 9105 }, { "epoch": 7.040185471406492, "grad_norm": 0.7511253952980042, "learning_rate": 4.0937574249381375e-05, "loss": 0.4645, "num_input_tokens_seen": 3064192, "step": 9110 }, { "epoch": 7.044049459041731, "grad_norm": 0.8473935127258301, "learning_rate": 4.0924581008134544e-05, "loss": 0.5179, "num_input_tokens_seen": 3066144, "step": 9115 }, { "epoch": 7.04791344667697, "grad_norm": 1.5226105451583862, "learning_rate": 4.091158052429289e-05, "loss": 0.4895, "num_input_tokens_seen": 3067680, "step": 9120 }, { "epoch": 7.051777434312211, "grad_norm": 0.5627729296684265, "learning_rate": 4.089857280376914e-05, "loss": 0.4321, "num_input_tokens_seen": 3069568, "step": 9125 }, { "epoch": 7.05564142194745, "grad_norm": 1.2074183225631714, "learning_rate": 4.088555785247925e-05, "loss": 0.5975, "num_input_tokens_seen": 3071520, "step": 9130 }, { "epoch": 7.059505409582689, "grad_norm": 0.6954149007797241, "learning_rate": 4.087253567634253e-05, "loss": 0.3806, "num_input_tokens_seen": 3073248, "step": 9135 }, { "epoch": 7.063369397217929, "grad_norm": 1.14155113697052, "learning_rate": 4.085950628128151e-05, "loss": 0.475, "num_input_tokens_seen": 3074784, "step": 9140 }, { "epoch": 7.067233384853169, "grad_norm": 0.793301522731781, "learning_rate": 4.084646967322206e-05, "loss": 0.3901, "num_input_tokens_seen": 3076544, "step": 9145 }, { "epoch": 7.071097372488408, "grad_norm": 0.9733883142471313, "learning_rate": 4.083342585809331e-05, "loss": 0.4767, "num_input_tokens_seen": 3078080, "step": 9150 }, { "epoch": 7.074961360123647, "grad_norm": 0.647707998752594, "learning_rate": 4.082037484182766e-05, "loss": 0.5762, "num_input_tokens_seen": 3079872, "step": 9155 }, { "epoch": 7.078825347758887, "grad_norm": 0.8159125447273254, "learning_rate": 4.080731663036077e-05, "loss": 0.3899, "num_input_tokens_seen": 3081536, "step": 9160 }, { "epoch": 7.082689335394127, "grad_norm": 1.2559127807617188, "learning_rate": 4.0794251229631624e-05, "loss": 0.4132, "num_input_tokens_seen": 3083008, "step": 9165 }, { "epoch": 7.086553323029366, "grad_norm": 0.5760725140571594, "learning_rate": 4.078117864558243e-05, "loss": 0.3532, "num_input_tokens_seen": 3084512, "step": 9170 }, { "epoch": 7.090417310664606, "grad_norm": 0.7387211322784424, "learning_rate": 4.0768098884158674e-05, "loss": 0.4826, "num_input_tokens_seen": 3086208, "step": 9175 }, { "epoch": 7.094281298299846, "grad_norm": 1.1417045593261719, "learning_rate": 4.0755011951309115e-05, "loss": 0.4304, "num_input_tokens_seen": 3087872, "step": 9180 }, { "epoch": 7.098145285935085, "grad_norm": 1.0900622606277466, "learning_rate": 4.074191785298577e-05, "loss": 0.4702, "num_input_tokens_seen": 3089728, "step": 9185 }, { "epoch": 7.102009273570324, "grad_norm": 0.6103489995002747, "learning_rate": 4.07288165951439e-05, "loss": 0.469, "num_input_tokens_seen": 3091232, "step": 9190 }, { "epoch": 7.105873261205564, "grad_norm": 0.7993822693824768, "learning_rate": 4.071570818374206e-05, "loss": 0.3733, "num_input_tokens_seen": 3093088, "step": 9195 }, { "epoch": 7.109737248840804, "grad_norm": 0.7116100788116455, "learning_rate": 4.070259262474201e-05, "loss": 0.4509, "num_input_tokens_seen": 3094496, "step": 9200 }, { "epoch": 7.113601236476043, "grad_norm": 1.313056230545044, "learning_rate": 4.0689469924108804e-05, "loss": 0.6682, "num_input_tokens_seen": 3096032, "step": 9205 }, { "epoch": 7.117465224111283, "grad_norm": 1.2131439447402954, "learning_rate": 4.067634008781072e-05, "loss": 0.389, "num_input_tokens_seen": 3097824, "step": 9210 }, { "epoch": 7.121329211746523, "grad_norm": 0.7148282527923584, "learning_rate": 4.066320312181927e-05, "loss": 0.4525, "num_input_tokens_seen": 3099584, "step": 9215 }, { "epoch": 7.125193199381762, "grad_norm": 1.8929694890975952, "learning_rate": 4.065005903210923e-05, "loss": 0.6336, "num_input_tokens_seen": 3101312, "step": 9220 }, { "epoch": 7.129057187017001, "grad_norm": 0.6972530484199524, "learning_rate": 4.063690782465863e-05, "loss": 0.53, "num_input_tokens_seen": 3103136, "step": 9225 }, { "epoch": 7.132921174652241, "grad_norm": 0.6084619164466858, "learning_rate": 4.062374950544871e-05, "loss": 0.3581, "num_input_tokens_seen": 3104768, "step": 9230 }, { "epoch": 7.136785162287481, "grad_norm": 0.542906641960144, "learning_rate": 4.0610584080463946e-05, "loss": 0.3465, "num_input_tokens_seen": 3106432, "step": 9235 }, { "epoch": 7.14064914992272, "grad_norm": 1.1278331279754639, "learning_rate": 4.0597411555692044e-05, "loss": 0.4319, "num_input_tokens_seen": 3108096, "step": 9240 }, { "epoch": 7.14451313755796, "grad_norm": 0.8079805970191956, "learning_rate": 4.058423193712397e-05, "loss": 0.3972, "num_input_tokens_seen": 3109792, "step": 9245 }, { "epoch": 7.1483771251932, "grad_norm": 0.7839367985725403, "learning_rate": 4.057104523075387e-05, "loss": 0.4955, "num_input_tokens_seen": 3111712, "step": 9250 }, { "epoch": 7.152241112828439, "grad_norm": 1.8169798851013184, "learning_rate": 4.055785144257915e-05, "loss": 0.4825, "num_input_tokens_seen": 3113216, "step": 9255 }, { "epoch": 7.156105100463678, "grad_norm": 0.8210676312446594, "learning_rate": 4.054465057860043e-05, "loss": 0.3802, "num_input_tokens_seen": 3114944, "step": 9260 }, { "epoch": 7.159969088098918, "grad_norm": 0.7347766160964966, "learning_rate": 4.053144264482153e-05, "loss": 0.6366, "num_input_tokens_seen": 3116736, "step": 9265 }, { "epoch": 7.163833075734158, "grad_norm": 0.5769762396812439, "learning_rate": 4.0518227647249495e-05, "loss": 0.3189, "num_input_tokens_seen": 3118528, "step": 9270 }, { "epoch": 7.167697063369397, "grad_norm": 0.9543682932853699, "learning_rate": 4.0505005591894595e-05, "loss": 0.5149, "num_input_tokens_seen": 3120128, "step": 9275 }, { "epoch": 7.171561051004637, "grad_norm": 0.9920395612716675, "learning_rate": 4.04917764847703e-05, "loss": 0.5408, "num_input_tokens_seen": 3121696, "step": 9280 }, { "epoch": 7.175425038639876, "grad_norm": 1.2044073343276978, "learning_rate": 4.0478540331893263e-05, "loss": 0.3599, "num_input_tokens_seen": 3123392, "step": 9285 }, { "epoch": 7.179289026275116, "grad_norm": 0.8817614316940308, "learning_rate": 4.0465297139283406e-05, "loss": 0.4421, "num_input_tokens_seen": 3125024, "step": 9290 }, { "epoch": 7.183153013910355, "grad_norm": 0.9193201661109924, "learning_rate": 4.0452046912963794e-05, "loss": 0.4859, "num_input_tokens_seen": 3127104, "step": 9295 }, { "epoch": 7.187017001545595, "grad_norm": 0.9261096119880676, "learning_rate": 4.04387896589607e-05, "loss": 0.599, "num_input_tokens_seen": 3128704, "step": 9300 }, { "epoch": 7.190880989180835, "grad_norm": 2.2733988761901855, "learning_rate": 4.042552538330361e-05, "loss": 0.6573, "num_input_tokens_seen": 3130368, "step": 9305 }, { "epoch": 7.194744976816074, "grad_norm": 0.8930505514144897, "learning_rate": 4.041225409202519e-05, "loss": 0.3668, "num_input_tokens_seen": 3132320, "step": 9310 }, { "epoch": 7.198608964451314, "grad_norm": 0.8385687470436096, "learning_rate": 4.039897579116132e-05, "loss": 0.4837, "num_input_tokens_seen": 3134368, "step": 9315 }, { "epoch": 7.202472952086553, "grad_norm": 0.7742083668708801, "learning_rate": 4.038569048675103e-05, "loss": 0.4801, "num_input_tokens_seen": 3136160, "step": 9320 }, { "epoch": 7.206336939721793, "grad_norm": 1.5544378757476807, "learning_rate": 4.037239818483657e-05, "loss": 0.46, "num_input_tokens_seen": 3137888, "step": 9325 }, { "epoch": 7.210200927357032, "grad_norm": 1.3118422031402588, "learning_rate": 4.0359098891463355e-05, "loss": 0.5444, "num_input_tokens_seen": 3139648, "step": 9330 }, { "epoch": 7.214064914992272, "grad_norm": 0.8478004336357117, "learning_rate": 4.034579261267998e-05, "loss": 0.7151, "num_input_tokens_seen": 3141248, "step": 9335 }, { "epoch": 7.217928902627512, "grad_norm": 0.9137303829193115, "learning_rate": 4.033247935453822e-05, "loss": 0.5069, "num_input_tokens_seen": 3143040, "step": 9340 }, { "epoch": 7.221792890262751, "grad_norm": 0.708225667476654, "learning_rate": 4.031915912309303e-05, "loss": 0.6293, "num_input_tokens_seen": 3144768, "step": 9345 }, { "epoch": 7.225656877897991, "grad_norm": 0.5891476273536682, "learning_rate": 4.030583192440253e-05, "loss": 0.3608, "num_input_tokens_seen": 3146528, "step": 9350 }, { "epoch": 7.22952086553323, "grad_norm": 1.30864417552948, "learning_rate": 4.0292497764528e-05, "loss": 0.5443, "num_input_tokens_seen": 3148544, "step": 9355 }, { "epoch": 7.23338485316847, "grad_norm": 0.7166116833686829, "learning_rate": 4.027915664953391e-05, "loss": 0.6978, "num_input_tokens_seen": 3150208, "step": 9360 }, { "epoch": 7.2372488408037094, "grad_norm": 1.5980020761489868, "learning_rate": 4.0265808585487874e-05, "loss": 0.8124, "num_input_tokens_seen": 3151744, "step": 9365 }, { "epoch": 7.241112828438949, "grad_norm": 1.0793935060501099, "learning_rate": 4.0252453578460666e-05, "loss": 0.5587, "num_input_tokens_seen": 3153504, "step": 9370 }, { "epoch": 7.244976816074189, "grad_norm": 0.8739352226257324, "learning_rate": 4.023909163452623e-05, "loss": 0.4979, "num_input_tokens_seen": 3155200, "step": 9375 }, { "epoch": 7.248840803709428, "grad_norm": 0.6635655164718628, "learning_rate": 4.0225722759761656e-05, "loss": 0.3501, "num_input_tokens_seen": 3156736, "step": 9380 }, { "epoch": 7.252704791344668, "grad_norm": 0.6946961879730225, "learning_rate": 4.021234696024718e-05, "loss": 0.4156, "num_input_tokens_seen": 3158176, "step": 9385 }, { "epoch": 7.256568778979907, "grad_norm": 0.5797749161720276, "learning_rate": 4.0198964242066215e-05, "loss": 0.3204, "num_input_tokens_seen": 3159808, "step": 9390 }, { "epoch": 7.260432766615147, "grad_norm": 1.1516213417053223, "learning_rate": 4.018557461130528e-05, "loss": 0.5109, "num_input_tokens_seen": 3161216, "step": 9395 }, { "epoch": 7.2642967542503865, "grad_norm": 0.6943722367286682, "learning_rate": 4.017217807405407e-05, "loss": 0.458, "num_input_tokens_seen": 3162912, "step": 9400 }, { "epoch": 7.268160741885626, "grad_norm": 0.7584717869758606, "learning_rate": 4.015877463640542e-05, "loss": 0.3847, "num_input_tokens_seen": 3164480, "step": 9405 }, { "epoch": 7.272024729520865, "grad_norm": 1.0497462749481201, "learning_rate": 4.014536430445527e-05, "loss": 0.5696, "num_input_tokens_seen": 3166112, "step": 9410 }, { "epoch": 7.275888717156105, "grad_norm": 1.2863709926605225, "learning_rate": 4.013194708430273e-05, "loss": 0.6104, "num_input_tokens_seen": 3167872, "step": 9415 }, { "epoch": 7.279752704791345, "grad_norm": 1.7266300916671753, "learning_rate": 4.0118522982050045e-05, "loss": 0.398, "num_input_tokens_seen": 3169632, "step": 9420 }, { "epoch": 7.283616692426584, "grad_norm": 0.9532575011253357, "learning_rate": 4.010509200380257e-05, "loss": 0.3909, "num_input_tokens_seen": 3171616, "step": 9425 }, { "epoch": 7.287480680061824, "grad_norm": 1.2343792915344238, "learning_rate": 4.009165415566878e-05, "loss": 0.7583, "num_input_tokens_seen": 3173440, "step": 9430 }, { "epoch": 7.2913446676970635, "grad_norm": 0.973692774772644, "learning_rate": 4.007820944376031e-05, "loss": 0.4473, "num_input_tokens_seen": 3175072, "step": 9435 }, { "epoch": 7.295208655332303, "grad_norm": 1.0443471670150757, "learning_rate": 4.006475787419187e-05, "loss": 0.4397, "num_input_tokens_seen": 3177152, "step": 9440 }, { "epoch": 7.299072642967542, "grad_norm": 1.255399227142334, "learning_rate": 4.0051299453081337e-05, "loss": 0.4325, "num_input_tokens_seen": 3179104, "step": 9445 }, { "epoch": 7.302936630602782, "grad_norm": 0.9553871750831604, "learning_rate": 4.003783418654968e-05, "loss": 0.4477, "num_input_tokens_seen": 3180704, "step": 9450 }, { "epoch": 7.306800618238022, "grad_norm": 1.961591362953186, "learning_rate": 4.0024362080720964e-05, "loss": 0.6664, "num_input_tokens_seen": 3182400, "step": 9455 }, { "epoch": 7.310664605873261, "grad_norm": 1.4264377355575562, "learning_rate": 4.001088314172241e-05, "loss": 0.5478, "num_input_tokens_seen": 3184000, "step": 9460 }, { "epoch": 7.314528593508501, "grad_norm": 0.9828099012374878, "learning_rate": 3.9997397375684295e-05, "loss": 0.3605, "num_input_tokens_seen": 3185472, "step": 9465 }, { "epoch": 7.3183925811437405, "grad_norm": 1.7315175533294678, "learning_rate": 3.9983904788740044e-05, "loss": 0.4951, "num_input_tokens_seen": 3187200, "step": 9470 }, { "epoch": 7.32225656877898, "grad_norm": 0.9812268018722534, "learning_rate": 3.9970405387026165e-05, "loss": 0.3793, "num_input_tokens_seen": 3188672, "step": 9475 }, { "epoch": 7.326120556414219, "grad_norm": 0.9845016002655029, "learning_rate": 3.995689917668225e-05, "loss": 0.3975, "num_input_tokens_seen": 3190464, "step": 9480 }, { "epoch": 7.329984544049459, "grad_norm": 1.558713674545288, "learning_rate": 3.9943386163851025e-05, "loss": 0.5938, "num_input_tokens_seen": 3192128, "step": 9485 }, { "epoch": 7.333848531684699, "grad_norm": 0.8003351092338562, "learning_rate": 3.992986635467828e-05, "loss": 0.4591, "num_input_tokens_seen": 3193856, "step": 9490 }, { "epoch": 7.337712519319938, "grad_norm": 1.0666455030441284, "learning_rate": 3.99163397553129e-05, "loss": 0.4745, "num_input_tokens_seen": 3195328, "step": 9495 }, { "epoch": 7.341576506955178, "grad_norm": 0.5374940037727356, "learning_rate": 3.9902806371906875e-05, "loss": 0.6418, "num_input_tokens_seen": 3196992, "step": 9500 }, { "epoch": 7.3454404945904175, "grad_norm": 1.257023811340332, "learning_rate": 3.988926621061526e-05, "loss": 0.4287, "num_input_tokens_seen": 3198400, "step": 9505 }, { "epoch": 7.349304482225657, "grad_norm": 0.9816815257072449, "learning_rate": 3.9875719277596204e-05, "loss": 0.5048, "num_input_tokens_seen": 3200032, "step": 9510 }, { "epoch": 7.353168469860896, "grad_norm": 0.7576959133148193, "learning_rate": 3.986216557901094e-05, "loss": 0.424, "num_input_tokens_seen": 3201888, "step": 9515 }, { "epoch": 7.357032457496136, "grad_norm": 0.7029957175254822, "learning_rate": 3.9848605121023746e-05, "loss": 0.3654, "num_input_tokens_seen": 3203456, "step": 9520 }, { "epoch": 7.360896445131376, "grad_norm": 0.6127563714981079, "learning_rate": 3.9835037909802034e-05, "loss": 0.3795, "num_input_tokens_seen": 3205056, "step": 9525 }, { "epoch": 7.364760432766615, "grad_norm": 0.9199179410934448, "learning_rate": 3.9821463951516234e-05, "loss": 0.5094, "num_input_tokens_seen": 3206784, "step": 9530 }, { "epoch": 7.368624420401854, "grad_norm": 0.9048771858215332, "learning_rate": 3.980788325233986e-05, "loss": 0.3695, "num_input_tokens_seen": 3208160, "step": 9535 }, { "epoch": 7.3724884080370945, "grad_norm": 0.8141015768051147, "learning_rate": 3.9794295818449515e-05, "loss": 0.629, "num_input_tokens_seen": 3209760, "step": 9540 }, { "epoch": 7.376352395672334, "grad_norm": 0.822795569896698, "learning_rate": 3.9780701656024815e-05, "loss": 0.4253, "num_input_tokens_seen": 3211520, "step": 9545 }, { "epoch": 7.380216383307573, "grad_norm": 0.8787826299667358, "learning_rate": 3.97671007712485e-05, "loss": 0.4931, "num_input_tokens_seen": 3213248, "step": 9550 }, { "epoch": 7.384080370942813, "grad_norm": 1.0384248495101929, "learning_rate": 3.9753493170306314e-05, "loss": 0.5914, "num_input_tokens_seen": 3214912, "step": 9555 }, { "epoch": 7.387944358578053, "grad_norm": 1.9097832441329956, "learning_rate": 3.973987885938707e-05, "loss": 0.4099, "num_input_tokens_seen": 3216416, "step": 9560 }, { "epoch": 7.391808346213292, "grad_norm": 1.105542540550232, "learning_rate": 3.972625784468264e-05, "loss": 0.4653, "num_input_tokens_seen": 3218304, "step": 9565 }, { "epoch": 7.395672333848531, "grad_norm": 0.9647973775863647, "learning_rate": 3.9712630132387975e-05, "loss": 0.4149, "num_input_tokens_seen": 3219840, "step": 9570 }, { "epoch": 7.3995363214837715, "grad_norm": 1.1690970659255981, "learning_rate": 3.969899572870101e-05, "loss": 0.469, "num_input_tokens_seen": 3221408, "step": 9575 }, { "epoch": 7.403400309119011, "grad_norm": 0.7823034524917603, "learning_rate": 3.968535463982275e-05, "loss": 0.6454, "num_input_tokens_seen": 3223040, "step": 9580 }, { "epoch": 7.40726429675425, "grad_norm": 0.8332911133766174, "learning_rate": 3.967170687195725e-05, "loss": 0.4364, "num_input_tokens_seen": 3224672, "step": 9585 }, { "epoch": 7.41112828438949, "grad_norm": 1.9601926803588867, "learning_rate": 3.96580524313116e-05, "loss": 0.866, "num_input_tokens_seen": 3226560, "step": 9590 }, { "epoch": 7.41499227202473, "grad_norm": 1.0060348510742188, "learning_rate": 3.9644391324095925e-05, "loss": 0.548, "num_input_tokens_seen": 3228480, "step": 9595 }, { "epoch": 7.418856259659969, "grad_norm": 0.7604438662528992, "learning_rate": 3.963072355652338e-05, "loss": 0.5776, "num_input_tokens_seen": 3229920, "step": 9600 }, { "epoch": 7.422720247295208, "grad_norm": 1.890536904335022, "learning_rate": 3.961704913481012e-05, "loss": 0.7999, "num_input_tokens_seen": 3231904, "step": 9605 }, { "epoch": 7.4265842349304485, "grad_norm": 1.5376287698745728, "learning_rate": 3.96033680651754e-05, "loss": 0.5949, "num_input_tokens_seen": 3233824, "step": 9610 }, { "epoch": 7.430448222565688, "grad_norm": 1.1592553853988647, "learning_rate": 3.958968035384142e-05, "loss": 0.5555, "num_input_tokens_seen": 3235648, "step": 9615 }, { "epoch": 7.434312210200927, "grad_norm": 0.6810634732246399, "learning_rate": 3.957598600703344e-05, "loss": 0.4459, "num_input_tokens_seen": 3237504, "step": 9620 }, { "epoch": 7.438176197836167, "grad_norm": 1.0612707138061523, "learning_rate": 3.9562285030979727e-05, "loss": 0.4124, "num_input_tokens_seen": 3239360, "step": 9625 }, { "epoch": 7.442040185471407, "grad_norm": 0.7661236524581909, "learning_rate": 3.954857743191157e-05, "loss": 0.6723, "num_input_tokens_seen": 3240992, "step": 9630 }, { "epoch": 7.445904173106646, "grad_norm": 0.5392823219299316, "learning_rate": 3.953486321606328e-05, "loss": 0.3905, "num_input_tokens_seen": 3242848, "step": 9635 }, { "epoch": 7.449768160741885, "grad_norm": 0.7816034555435181, "learning_rate": 3.952114238967215e-05, "loss": 0.7492, "num_input_tokens_seen": 3244480, "step": 9640 }, { "epoch": 7.4536321483771255, "grad_norm": 1.0767958164215088, "learning_rate": 3.9507414958978494e-05, "loss": 0.3975, "num_input_tokens_seen": 3246016, "step": 9645 }, { "epoch": 7.457496136012365, "grad_norm": 0.8346661329269409, "learning_rate": 3.9493680930225626e-05, "loss": 0.4948, "num_input_tokens_seen": 3247424, "step": 9650 }, { "epoch": 7.461360123647604, "grad_norm": 0.7373729348182678, "learning_rate": 3.947994030965989e-05, "loss": 0.4685, "num_input_tokens_seen": 3248960, "step": 9655 }, { "epoch": 7.4652241112828435, "grad_norm": 0.6573053598403931, "learning_rate": 3.946619310353057e-05, "loss": 0.3785, "num_input_tokens_seen": 3250656, "step": 9660 }, { "epoch": 7.469088098918084, "grad_norm": 0.9944241642951965, "learning_rate": 3.945243931809e-05, "loss": 0.4699, "num_input_tokens_seen": 3252416, "step": 9665 }, { "epoch": 7.472952086553323, "grad_norm": 1.2153245210647583, "learning_rate": 3.943867895959347e-05, "loss": 0.5006, "num_input_tokens_seen": 3254112, "step": 9670 }, { "epoch": 7.476816074188562, "grad_norm": 1.0082063674926758, "learning_rate": 3.942491203429928e-05, "loss": 0.3805, "num_input_tokens_seen": 3255776, "step": 9675 }, { "epoch": 7.4806800618238025, "grad_norm": 0.6971703767776489, "learning_rate": 3.941113854846871e-05, "loss": 0.4368, "num_input_tokens_seen": 3257472, "step": 9680 }, { "epoch": 7.484544049459042, "grad_norm": 0.8187786936759949, "learning_rate": 3.939735850836602e-05, "loss": 0.5816, "num_input_tokens_seen": 3259168, "step": 9685 }, { "epoch": 7.488408037094281, "grad_norm": 0.5870392918586731, "learning_rate": 3.9383571920258465e-05, "loss": 0.3573, "num_input_tokens_seen": 3260608, "step": 9690 }, { "epoch": 7.4922720247295205, "grad_norm": 1.1987391710281372, "learning_rate": 3.936977879041626e-05, "loss": 0.5689, "num_input_tokens_seen": 3262336, "step": 9695 }, { "epoch": 7.496136012364761, "grad_norm": 0.9303717017173767, "learning_rate": 3.935597912511261e-05, "loss": 0.6064, "num_input_tokens_seen": 3264064, "step": 9700 }, { "epoch": 7.5, "grad_norm": 0.7821023464202881, "learning_rate": 3.934217293062367e-05, "loss": 0.3509, "num_input_tokens_seen": 3265984, "step": 9705 }, { "epoch": 7.503863987635239, "grad_norm": 0.7946360111236572, "learning_rate": 3.932836021322859e-05, "loss": 0.4306, "num_input_tokens_seen": 3267712, "step": 9710 }, { "epoch": 7.507727975270479, "grad_norm": 0.9570996761322021, "learning_rate": 3.931454097920947e-05, "loss": 0.4031, "num_input_tokens_seen": 3269536, "step": 9715 }, { "epoch": 7.511591962905719, "grad_norm": 1.0447319746017456, "learning_rate": 3.930071523485139e-05, "loss": 0.5579, "num_input_tokens_seen": 3271520, "step": 9720 }, { "epoch": 7.515455950540958, "grad_norm": 1.9550036191940308, "learning_rate": 3.928688298644238e-05, "loss": 0.5928, "num_input_tokens_seen": 3273088, "step": 9725 }, { "epoch": 7.5193199381761975, "grad_norm": 0.7511183619499207, "learning_rate": 3.9273044240273434e-05, "loss": 0.4771, "num_input_tokens_seen": 3274848, "step": 9730 }, { "epoch": 7.523183925811438, "grad_norm": 1.0806057453155518, "learning_rate": 3.925919900263848e-05, "loss": 0.4909, "num_input_tokens_seen": 3276576, "step": 9735 }, { "epoch": 7.527047913446677, "grad_norm": 0.8260632157325745, "learning_rate": 3.924534727983443e-05, "loss": 0.4005, "num_input_tokens_seen": 3278304, "step": 9740 }, { "epoch": 7.530911901081916, "grad_norm": 0.7740186452865601, "learning_rate": 3.923148907816112e-05, "loss": 0.3821, "num_input_tokens_seen": 3280160, "step": 9745 }, { "epoch": 7.5347758887171565, "grad_norm": 0.7298502326011658, "learning_rate": 3.921762440392135e-05, "loss": 0.3997, "num_input_tokens_seen": 3281824, "step": 9750 }, { "epoch": 7.538639876352396, "grad_norm": 0.7638840079307556, "learning_rate": 3.9203753263420854e-05, "loss": 0.5769, "num_input_tokens_seen": 3283648, "step": 9755 }, { "epoch": 7.542503863987635, "grad_norm": 0.7019403576850891, "learning_rate": 3.918987566296831e-05, "loss": 0.3423, "num_input_tokens_seen": 3285472, "step": 9760 }, { "epoch": 7.5463678516228745, "grad_norm": 0.5141065120697021, "learning_rate": 3.917599160887534e-05, "loss": 0.3543, "num_input_tokens_seen": 3287136, "step": 9765 }, { "epoch": 7.550231839258115, "grad_norm": 0.7514243721961975, "learning_rate": 3.916210110745648e-05, "loss": 0.4985, "num_input_tokens_seen": 3289024, "step": 9770 }, { "epoch": 7.554095826893354, "grad_norm": 0.7035478353500366, "learning_rate": 3.9148204165029235e-05, "loss": 0.3718, "num_input_tokens_seen": 3290624, "step": 9775 }, { "epoch": 7.557959814528593, "grad_norm": 0.790408730506897, "learning_rate": 3.9134300787914e-05, "loss": 0.3905, "num_input_tokens_seen": 3292416, "step": 9780 }, { "epoch": 7.561823802163833, "grad_norm": 1.131557822227478, "learning_rate": 3.912039098243412e-05, "loss": 0.4249, "num_input_tokens_seen": 3293920, "step": 9785 }, { "epoch": 7.565687789799073, "grad_norm": 0.9150944352149963, "learning_rate": 3.9106474754915856e-05, "loss": 0.5046, "num_input_tokens_seen": 3295776, "step": 9790 }, { "epoch": 7.569551777434312, "grad_norm": 1.2133347988128662, "learning_rate": 3.90925521116884e-05, "loss": 0.4095, "num_input_tokens_seen": 3297568, "step": 9795 }, { "epoch": 7.5734157650695515, "grad_norm": 0.7167171239852905, "learning_rate": 3.907862305908384e-05, "loss": 0.431, "num_input_tokens_seen": 3299360, "step": 9800 }, { "epoch": 7.577279752704792, "grad_norm": 0.8856620192527771, "learning_rate": 3.90646876034372e-05, "loss": 0.4122, "num_input_tokens_seen": 3300896, "step": 9805 }, { "epoch": 7.581143740340031, "grad_norm": 0.6301616430282593, "learning_rate": 3.905074575108641e-05, "loss": 0.3957, "num_input_tokens_seen": 3302432, "step": 9810 }, { "epoch": 7.58500772797527, "grad_norm": 1.0330426692962646, "learning_rate": 3.9036797508372306e-05, "loss": 0.4777, "num_input_tokens_seen": 3304064, "step": 9815 }, { "epoch": 7.58887171561051, "grad_norm": 0.6151154637336731, "learning_rate": 3.9022842881638624e-05, "loss": 0.5305, "num_input_tokens_seen": 3305792, "step": 9820 }, { "epoch": 7.59273570324575, "grad_norm": 1.1703745126724243, "learning_rate": 3.900888187723203e-05, "loss": 0.4752, "num_input_tokens_seen": 3307424, "step": 9825 }, { "epoch": 7.596599690880989, "grad_norm": 0.80409175157547, "learning_rate": 3.899491450150206e-05, "loss": 0.6072, "num_input_tokens_seen": 3309024, "step": 9830 }, { "epoch": 7.6004636785162285, "grad_norm": 0.7032650709152222, "learning_rate": 3.898094076080115e-05, "loss": 0.3691, "num_input_tokens_seen": 3310688, "step": 9835 }, { "epoch": 7.604327666151468, "grad_norm": 0.4462343454360962, "learning_rate": 3.8966960661484666e-05, "loss": 0.5148, "num_input_tokens_seen": 3312320, "step": 9840 }, { "epoch": 7.608191653786708, "grad_norm": 0.8444419503211975, "learning_rate": 3.895297420991083e-05, "loss": 0.3899, "num_input_tokens_seen": 3313792, "step": 9845 }, { "epoch": 7.612055641421947, "grad_norm": 0.7171887755393982, "learning_rate": 3.8938981412440755e-05, "loss": 0.4313, "num_input_tokens_seen": 3315392, "step": 9850 }, { "epoch": 7.615919629057187, "grad_norm": 0.663389265537262, "learning_rate": 3.892498227543846e-05, "loss": 0.4901, "num_input_tokens_seen": 3317024, "step": 9855 }, { "epoch": 7.619783616692427, "grad_norm": 1.5437928438186646, "learning_rate": 3.8910976805270837e-05, "loss": 0.6342, "num_input_tokens_seen": 3318784, "step": 9860 }, { "epoch": 7.623647604327666, "grad_norm": 0.8067506551742554, "learning_rate": 3.8896965008307646e-05, "loss": 0.388, "num_input_tokens_seen": 3320512, "step": 9865 }, { "epoch": 7.6275115919629055, "grad_norm": 0.4749433100223541, "learning_rate": 3.8882946890921545e-05, "loss": 0.7217, "num_input_tokens_seen": 3322336, "step": 9870 }, { "epoch": 7.631375579598146, "grad_norm": 0.9570181369781494, "learning_rate": 3.886892245948806e-05, "loss": 0.4955, "num_input_tokens_seen": 3324288, "step": 9875 }, { "epoch": 7.635239567233385, "grad_norm": 0.9018824100494385, "learning_rate": 3.885489172038558e-05, "loss": 0.582, "num_input_tokens_seen": 3325984, "step": 9880 }, { "epoch": 7.639103554868624, "grad_norm": 1.1531507968902588, "learning_rate": 3.884085467999537e-05, "loss": 0.5522, "num_input_tokens_seen": 3327680, "step": 9885 }, { "epoch": 7.642967542503864, "grad_norm": 1.3722370862960815, "learning_rate": 3.8826811344701565e-05, "loss": 0.6896, "num_input_tokens_seen": 3329248, "step": 9890 }, { "epoch": 7.646831530139104, "grad_norm": 0.9870447516441345, "learning_rate": 3.8812761720891156e-05, "loss": 0.4161, "num_input_tokens_seen": 3331104, "step": 9895 }, { "epoch": 7.650695517774343, "grad_norm": 1.8244184255599976, "learning_rate": 3.879870581495399e-05, "loss": 0.5577, "num_input_tokens_seen": 3332864, "step": 9900 }, { "epoch": 7.6545595054095825, "grad_norm": 0.8791717290878296, "learning_rate": 3.878464363328279e-05, "loss": 0.3762, "num_input_tokens_seen": 3334240, "step": 9905 }, { "epoch": 7.658423493044822, "grad_norm": 1.0358984470367432, "learning_rate": 3.8770575182273104e-05, "loss": 0.5142, "num_input_tokens_seen": 3335744, "step": 9910 }, { "epoch": 7.662287480680062, "grad_norm": 0.9391973614692688, "learning_rate": 3.8756500468323365e-05, "loss": 0.3682, "num_input_tokens_seen": 3337472, "step": 9915 }, { "epoch": 7.666151468315301, "grad_norm": 1.3855679035186768, "learning_rate": 3.874241949783483e-05, "loss": 0.3895, "num_input_tokens_seen": 3339424, "step": 9920 }, { "epoch": 7.670015455950541, "grad_norm": 0.7089762687683105, "learning_rate": 3.872833227721159e-05, "loss": 0.6738, "num_input_tokens_seen": 3341248, "step": 9925 }, { "epoch": 7.673879443585781, "grad_norm": 1.2481938600540161, "learning_rate": 3.871423881286062e-05, "loss": 0.6316, "num_input_tokens_seen": 3342976, "step": 9930 }, { "epoch": 7.67774343122102, "grad_norm": 0.6399189829826355, "learning_rate": 3.870013911119172e-05, "loss": 0.378, "num_input_tokens_seen": 3344480, "step": 9935 }, { "epoch": 7.6816074188562595, "grad_norm": 0.8808050751686096, "learning_rate": 3.8686033178617496e-05, "loss": 0.4194, "num_input_tokens_seen": 3345856, "step": 9940 }, { "epoch": 7.685471406491499, "grad_norm": 1.104719638824463, "learning_rate": 3.8671921021553427e-05, "loss": 0.6383, "num_input_tokens_seen": 3347552, "step": 9945 }, { "epoch": 7.689335394126739, "grad_norm": 0.8633230328559875, "learning_rate": 3.865780264641778e-05, "loss": 0.5895, "num_input_tokens_seen": 3349440, "step": 9950 }, { "epoch": 7.693199381761978, "grad_norm": 2.0350520610809326, "learning_rate": 3.864367805963172e-05, "loss": 0.6101, "num_input_tokens_seen": 3351232, "step": 9955 }, { "epoch": 7.697063369397218, "grad_norm": 0.6164402961730957, "learning_rate": 3.862954726761916e-05, "loss": 0.5189, "num_input_tokens_seen": 3352832, "step": 9960 }, { "epoch": 7.700927357032457, "grad_norm": 0.952416718006134, "learning_rate": 3.8615410276806874e-05, "loss": 0.5334, "num_input_tokens_seen": 3354496, "step": 9965 }, { "epoch": 7.704791344667697, "grad_norm": 1.0631064176559448, "learning_rate": 3.860126709362446e-05, "loss": 0.4658, "num_input_tokens_seen": 3356224, "step": 9970 }, { "epoch": 7.7086553323029365, "grad_norm": 1.1840410232543945, "learning_rate": 3.858711772450432e-05, "loss": 0.4984, "num_input_tokens_seen": 3358144, "step": 9975 }, { "epoch": 7.712519319938176, "grad_norm": 0.7962307929992676, "learning_rate": 3.857296217588167e-05, "loss": 0.5246, "num_input_tokens_seen": 3359424, "step": 9980 }, { "epoch": 7.716383307573416, "grad_norm": 0.7513737082481384, "learning_rate": 3.8558800454194524e-05, "loss": 0.5798, "num_input_tokens_seen": 3361408, "step": 9985 }, { "epoch": 7.720247295208655, "grad_norm": 0.9939054250717163, "learning_rate": 3.854463256588374e-05, "loss": 0.422, "num_input_tokens_seen": 3363072, "step": 9990 }, { "epoch": 7.724111282843895, "grad_norm": 0.7359746098518372, "learning_rate": 3.853045851739295e-05, "loss": 0.3839, "num_input_tokens_seen": 3364832, "step": 9995 }, { "epoch": 7.727975270479135, "grad_norm": 1.4284530878067017, "learning_rate": 3.851627831516859e-05, "loss": 0.8164, "num_input_tokens_seen": 3366272, "step": 10000 }, { "epoch": 7.731839258114374, "grad_norm": 0.9110018610954285, "learning_rate": 3.850209196565991e-05, "loss": 0.4522, "num_input_tokens_seen": 3367968, "step": 10005 }, { "epoch": 7.7357032457496135, "grad_norm": 0.9356684684753418, "learning_rate": 3.848789947531895e-05, "loss": 0.3738, "num_input_tokens_seen": 3369440, "step": 10010 }, { "epoch": 7.739567233384853, "grad_norm": 0.5649460554122925, "learning_rate": 3.847370085060052e-05, "loss": 0.4614, "num_input_tokens_seen": 3371168, "step": 10015 }, { "epoch": 7.743431221020093, "grad_norm": 0.836029052734375, "learning_rate": 3.845949609796226e-05, "loss": 0.4637, "num_input_tokens_seen": 3372672, "step": 10020 }, { "epoch": 7.747295208655332, "grad_norm": 0.6755355596542358, "learning_rate": 3.844528522386457e-05, "loss": 0.4957, "num_input_tokens_seen": 3374304, "step": 10025 }, { "epoch": 7.751159196290572, "grad_norm": 0.59604412317276, "learning_rate": 3.8431068234770636e-05, "loss": 0.4198, "num_input_tokens_seen": 3375872, "step": 10030 }, { "epoch": 7.755023183925811, "grad_norm": 0.6242092847824097, "learning_rate": 3.841684513714643e-05, "loss": 0.4429, "num_input_tokens_seen": 3377600, "step": 10035 }, { "epoch": 7.758887171561051, "grad_norm": 0.7559676766395569, "learning_rate": 3.840261593746071e-05, "loss": 0.4472, "num_input_tokens_seen": 3379584, "step": 10040 }, { "epoch": 7.7627511591962906, "grad_norm": 1.4435524940490723, "learning_rate": 3.8388380642184993e-05, "loss": 0.4733, "num_input_tokens_seen": 3381472, "step": 10045 }, { "epoch": 7.76661514683153, "grad_norm": 0.8283237814903259, "learning_rate": 3.8374139257793586e-05, "loss": 0.4711, "num_input_tokens_seen": 3383360, "step": 10050 }, { "epoch": 7.77047913446677, "grad_norm": 1.2238843441009521, "learning_rate": 3.8359891790763546e-05, "loss": 0.4711, "num_input_tokens_seen": 3385312, "step": 10055 }, { "epoch": 7.774343122102009, "grad_norm": 0.8723092675209045, "learning_rate": 3.834563824757471e-05, "loss": 0.4266, "num_input_tokens_seen": 3387072, "step": 10060 }, { "epoch": 7.778207109737249, "grad_norm": 0.7643846869468689, "learning_rate": 3.833137863470968e-05, "loss": 0.4955, "num_input_tokens_seen": 3388576, "step": 10065 }, { "epoch": 7.782071097372488, "grad_norm": 0.7580907940864563, "learning_rate": 3.83171129586538e-05, "loss": 0.395, "num_input_tokens_seen": 3390368, "step": 10070 }, { "epoch": 7.785935085007728, "grad_norm": 0.661207914352417, "learning_rate": 3.8302841225895204e-05, "loss": 0.4351, "num_input_tokens_seen": 3391808, "step": 10075 }, { "epoch": 7.789799072642968, "grad_norm": 0.7030502557754517, "learning_rate": 3.8288563442924746e-05, "loss": 0.4255, "num_input_tokens_seen": 3393600, "step": 10080 }, { "epoch": 7.793663060278207, "grad_norm": 0.5856496095657349, "learning_rate": 3.827427961623604e-05, "loss": 0.4442, "num_input_tokens_seen": 3395232, "step": 10085 }, { "epoch": 7.797527047913446, "grad_norm": 0.8017972707748413, "learning_rate": 3.825998975232549e-05, "loss": 0.4703, "num_input_tokens_seen": 3396800, "step": 10090 }, { "epoch": 7.801391035548686, "grad_norm": 0.5475521683692932, "learning_rate": 3.824569385769218e-05, "loss": 0.7555, "num_input_tokens_seen": 3398432, "step": 10095 }, { "epoch": 7.805255023183926, "grad_norm": 1.360075831413269, "learning_rate": 3.8231391938837966e-05, "loss": 0.4343, "num_input_tokens_seen": 3400096, "step": 10100 }, { "epoch": 7.809119010819165, "grad_norm": 1.0338289737701416, "learning_rate": 3.821708400226747e-05, "loss": 0.469, "num_input_tokens_seen": 3401760, "step": 10105 }, { "epoch": 7.812982998454405, "grad_norm": 1.0208146572113037, "learning_rate": 3.820277005448802e-05, "loss": 0.4409, "num_input_tokens_seen": 3403392, "step": 10110 }, { "epoch": 7.816846986089645, "grad_norm": 0.9352439045906067, "learning_rate": 3.8188450102009674e-05, "loss": 0.5405, "num_input_tokens_seen": 3405056, "step": 10115 }, { "epoch": 7.820710973724884, "grad_norm": 0.5761824250221252, "learning_rate": 3.817412415134525e-05, "loss": 0.355, "num_input_tokens_seen": 3406816, "step": 10120 }, { "epoch": 7.824574961360124, "grad_norm": 1.0345910787582397, "learning_rate": 3.815979220901025e-05, "loss": 0.4316, "num_input_tokens_seen": 3408640, "step": 10125 }, { "epoch": 7.828438948995363, "grad_norm": 0.6301133036613464, "learning_rate": 3.814545428152295e-05, "loss": 0.3735, "num_input_tokens_seen": 3410304, "step": 10130 }, { "epoch": 7.832302936630603, "grad_norm": 0.7576345801353455, "learning_rate": 3.813111037540432e-05, "loss": 0.4408, "num_input_tokens_seen": 3411840, "step": 10135 }, { "epoch": 7.836166924265842, "grad_norm": 0.7982921004295349, "learning_rate": 3.811676049717805e-05, "loss": 0.4048, "num_input_tokens_seen": 3413696, "step": 10140 }, { "epoch": 7.840030911901082, "grad_norm": 0.7475820183753967, "learning_rate": 3.810240465337055e-05, "loss": 0.425, "num_input_tokens_seen": 3415232, "step": 10145 }, { "epoch": 7.843894899536322, "grad_norm": 0.6947832107543945, "learning_rate": 3.8088042850510946e-05, "loss": 0.3615, "num_input_tokens_seen": 3416992, "step": 10150 }, { "epoch": 7.847758887171561, "grad_norm": 1.300175666809082, "learning_rate": 3.8073675095131076e-05, "loss": 0.4482, "num_input_tokens_seen": 3418688, "step": 10155 }, { "epoch": 7.8516228748068, "grad_norm": 0.9273701310157776, "learning_rate": 3.805930139376548e-05, "loss": 0.5616, "num_input_tokens_seen": 3420128, "step": 10160 }, { "epoch": 7.85548686244204, "grad_norm": 1.4274225234985352, "learning_rate": 3.804492175295139e-05, "loss": 0.416, "num_input_tokens_seen": 3421888, "step": 10165 }, { "epoch": 7.85935085007728, "grad_norm": 0.5816029906272888, "learning_rate": 3.803053617922877e-05, "loss": 0.5304, "num_input_tokens_seen": 3423488, "step": 10170 }, { "epoch": 7.863214837712519, "grad_norm": 0.6593968272209167, "learning_rate": 3.801614467914026e-05, "loss": 0.5184, "num_input_tokens_seen": 3425152, "step": 10175 }, { "epoch": 7.867078825347759, "grad_norm": 1.2633249759674072, "learning_rate": 3.80017472592312e-05, "loss": 0.4556, "num_input_tokens_seen": 3426656, "step": 10180 }, { "epoch": 7.870942812982999, "grad_norm": 0.7598260641098022, "learning_rate": 3.798734392604962e-05, "loss": 0.3911, "num_input_tokens_seen": 3428416, "step": 10185 }, { "epoch": 7.874806800618238, "grad_norm": 0.7532007098197937, "learning_rate": 3.7972934686146245e-05, "loss": 0.4346, "num_input_tokens_seen": 3430176, "step": 10190 }, { "epoch": 7.878670788253477, "grad_norm": 0.8734409809112549, "learning_rate": 3.795851954607448e-05, "loss": 0.4988, "num_input_tokens_seen": 3431840, "step": 10195 }, { "epoch": 7.882534775888717, "grad_norm": 0.8543374538421631, "learning_rate": 3.7944098512390426e-05, "loss": 0.4595, "num_input_tokens_seen": 3433312, "step": 10200 }, { "epoch": 7.886398763523957, "grad_norm": 1.2260394096374512, "learning_rate": 3.7929671591652843e-05, "loss": 0.7807, "num_input_tokens_seen": 3435168, "step": 10205 }, { "epoch": 7.890262751159196, "grad_norm": 0.8681738376617432, "learning_rate": 3.791523879042319e-05, "loss": 0.4924, "num_input_tokens_seen": 3436768, "step": 10210 }, { "epoch": 7.894126738794435, "grad_norm": 0.9839980602264404, "learning_rate": 3.790080011526559e-05, "loss": 0.5861, "num_input_tokens_seen": 3438464, "step": 10215 }, { "epoch": 7.897990726429676, "grad_norm": 1.4618967771530151, "learning_rate": 3.788635557274684e-05, "loss": 0.6972, "num_input_tokens_seen": 3439840, "step": 10220 }, { "epoch": 7.901854714064915, "grad_norm": 0.7342026829719543, "learning_rate": 3.78719051694364e-05, "loss": 0.4595, "num_input_tokens_seen": 3441440, "step": 10225 }, { "epoch": 7.905718701700154, "grad_norm": 0.9721516370773315, "learning_rate": 3.78574489119064e-05, "loss": 0.5807, "num_input_tokens_seen": 3443232, "step": 10230 }, { "epoch": 7.909582689335394, "grad_norm": 0.6224435567855835, "learning_rate": 3.784298680673164e-05, "loss": 0.5314, "num_input_tokens_seen": 3444896, "step": 10235 }, { "epoch": 7.913446676970634, "grad_norm": 0.48829931020736694, "learning_rate": 3.782851886048956e-05, "loss": 0.3785, "num_input_tokens_seen": 3446656, "step": 10240 }, { "epoch": 7.917310664605873, "grad_norm": 0.7709723711013794, "learning_rate": 3.781404507976027e-05, "loss": 0.5213, "num_input_tokens_seen": 3448608, "step": 10245 }, { "epoch": 7.921174652241113, "grad_norm": 2.041872024536133, "learning_rate": 3.779956547112655e-05, "loss": 0.5759, "num_input_tokens_seen": 3450240, "step": 10250 }, { "epoch": 7.925038639876353, "grad_norm": 0.8397512435913086, "learning_rate": 3.77850800411738e-05, "loss": 0.4436, "num_input_tokens_seen": 3452000, "step": 10255 }, { "epoch": 7.928902627511592, "grad_norm": 0.66033536195755, "learning_rate": 3.777058879649007e-05, "loss": 0.4675, "num_input_tokens_seen": 3453600, "step": 10260 }, { "epoch": 7.932766615146831, "grad_norm": 1.0662037134170532, "learning_rate": 3.7756091743666086e-05, "loss": 0.4451, "num_input_tokens_seen": 3455392, "step": 10265 }, { "epoch": 7.936630602782071, "grad_norm": 1.1133954524993896, "learning_rate": 3.7741588889295173e-05, "loss": 0.4546, "num_input_tokens_seen": 3457024, "step": 10270 }, { "epoch": 7.940494590417311, "grad_norm": 1.0217646360397339, "learning_rate": 3.7727080239973337e-05, "loss": 0.4059, "num_input_tokens_seen": 3458592, "step": 10275 }, { "epoch": 7.94435857805255, "grad_norm": 0.700514018535614, "learning_rate": 3.7712565802299185e-05, "loss": 0.3661, "num_input_tokens_seen": 3460384, "step": 10280 }, { "epoch": 7.948222565687789, "grad_norm": 1.1865845918655396, "learning_rate": 3.769804558287397e-05, "loss": 0.4436, "num_input_tokens_seen": 3461824, "step": 10285 }, { "epoch": 7.95208655332303, "grad_norm": 0.6260192394256592, "learning_rate": 3.768351958830159e-05, "loss": 0.4182, "num_input_tokens_seen": 3463584, "step": 10290 }, { "epoch": 7.955950540958269, "grad_norm": 0.5880019068717957, "learning_rate": 3.766898782518853e-05, "loss": 0.4145, "num_input_tokens_seen": 3465152, "step": 10295 }, { "epoch": 7.959814528593508, "grad_norm": 0.885270357131958, "learning_rate": 3.765445030014394e-05, "loss": 0.4266, "num_input_tokens_seen": 3466688, "step": 10300 }, { "epoch": 7.9636785162287484, "grad_norm": 0.5300600528717041, "learning_rate": 3.7639907019779565e-05, "loss": 0.6958, "num_input_tokens_seen": 3468128, "step": 10305 }, { "epoch": 7.967542503863988, "grad_norm": 0.6600220203399658, "learning_rate": 3.762535799070978e-05, "loss": 0.52, "num_input_tokens_seen": 3469792, "step": 10310 }, { "epoch": 7.971406491499227, "grad_norm": 1.3054265975952148, "learning_rate": 3.7610803219551574e-05, "loss": 0.4531, "num_input_tokens_seen": 3471424, "step": 10315 }, { "epoch": 7.975270479134466, "grad_norm": 0.549160361289978, "learning_rate": 3.7596242712924544e-05, "loss": 0.3638, "num_input_tokens_seen": 3473024, "step": 10320 }, { "epoch": 7.979134466769707, "grad_norm": 0.668687641620636, "learning_rate": 3.758167647745089e-05, "loss": 0.5279, "num_input_tokens_seen": 3474496, "step": 10325 }, { "epoch": 7.982998454404946, "grad_norm": 0.6848587393760681, "learning_rate": 3.756710451975543e-05, "loss": 0.3661, "num_input_tokens_seen": 3475968, "step": 10330 }, { "epoch": 7.986862442040185, "grad_norm": 0.8755740523338318, "learning_rate": 3.7552526846465565e-05, "loss": 0.5138, "num_input_tokens_seen": 3477792, "step": 10335 }, { "epoch": 7.990726429675425, "grad_norm": 0.7284384369850159, "learning_rate": 3.7537943464211314e-05, "loss": 0.4783, "num_input_tokens_seen": 3479264, "step": 10340 }, { "epoch": 7.994590417310665, "grad_norm": 0.6342600584030151, "learning_rate": 3.752335437962529e-05, "loss": 0.5301, "num_input_tokens_seen": 3480800, "step": 10345 }, { "epoch": 7.998454404945904, "grad_norm": 0.7193635106086731, "learning_rate": 3.7508759599342696e-05, "loss": 0.522, "num_input_tokens_seen": 3482336, "step": 10350 }, { "epoch": 8.0, "eval_loss": 0.48501694202423096, "eval_runtime": 6.3649, "eval_samples_per_second": 90.339, "eval_steps_per_second": 22.624, "num_input_tokens_seen": 3482720, "step": 10352 }, { "epoch": 8.002318392581143, "grad_norm": 3.0819053649902344, "learning_rate": 3.749415913000133e-05, "loss": 0.6288, "num_input_tokens_seen": 3483776, "step": 10355 }, { "epoch": 8.006182380216384, "grad_norm": 0.8646562695503235, "learning_rate": 3.7479552978241564e-05, "loss": 0.4056, "num_input_tokens_seen": 3485568, "step": 10360 }, { "epoch": 8.010046367851622, "grad_norm": 0.6020779013633728, "learning_rate": 3.746494115070636e-05, "loss": 0.403, "num_input_tokens_seen": 3487072, "step": 10365 }, { "epoch": 8.013910355486862, "grad_norm": 0.6407784819602966, "learning_rate": 3.745032365404127e-05, "loss": 0.5812, "num_input_tokens_seen": 3488832, "step": 10370 }, { "epoch": 8.017774343122102, "grad_norm": 0.8127278089523315, "learning_rate": 3.7435700494894434e-05, "loss": 0.4088, "num_input_tokens_seen": 3490688, "step": 10375 }, { "epoch": 8.021638330757341, "grad_norm": 0.8076125979423523, "learning_rate": 3.742107167991653e-05, "loss": 0.4776, "num_input_tokens_seen": 3492448, "step": 10380 }, { "epoch": 8.025502318392581, "grad_norm": 1.836619257926941, "learning_rate": 3.7406437215760836e-05, "loss": 0.4561, "num_input_tokens_seen": 3494048, "step": 10385 }, { "epoch": 8.029366306027821, "grad_norm": 0.7890726923942566, "learning_rate": 3.73917971090832e-05, "loss": 0.6039, "num_input_tokens_seen": 3495552, "step": 10390 }, { "epoch": 8.03323029366306, "grad_norm": 1.3161169290542603, "learning_rate": 3.737715136654203e-05, "loss": 0.6224, "num_input_tokens_seen": 3497344, "step": 10395 }, { "epoch": 8.0370942812983, "grad_norm": 0.6644477248191833, "learning_rate": 3.7362499994798296e-05, "loss": 0.4459, "num_input_tokens_seen": 3498880, "step": 10400 }, { "epoch": 8.04095826893354, "grad_norm": 0.816883385181427, "learning_rate": 3.734784300051552e-05, "loss": 0.5369, "num_input_tokens_seen": 3500448, "step": 10405 }, { "epoch": 8.044822256568779, "grad_norm": 0.9900310039520264, "learning_rate": 3.7333180390359805e-05, "loss": 0.5005, "num_input_tokens_seen": 3502080, "step": 10410 }, { "epoch": 8.048686244204019, "grad_norm": 1.3318735361099243, "learning_rate": 3.731851217099979e-05, "loss": 0.4823, "num_input_tokens_seen": 3503776, "step": 10415 }, { "epoch": 8.052550231839259, "grad_norm": 0.826292097568512, "learning_rate": 3.730383834910666e-05, "loss": 0.4125, "num_input_tokens_seen": 3505632, "step": 10420 }, { "epoch": 8.056414219474497, "grad_norm": 1.2048100233078003, "learning_rate": 3.728915893135417e-05, "loss": 0.5795, "num_input_tokens_seen": 3507520, "step": 10425 }, { "epoch": 8.060278207109738, "grad_norm": 1.1297597885131836, "learning_rate": 3.72744739244186e-05, "loss": 0.4011, "num_input_tokens_seen": 3509088, "step": 10430 }, { "epoch": 8.064142194744976, "grad_norm": 0.6235809922218323, "learning_rate": 3.7259783334978775e-05, "loss": 0.3758, "num_input_tokens_seen": 3510880, "step": 10435 }, { "epoch": 8.068006182380216, "grad_norm": 1.118224859237671, "learning_rate": 3.724508716971607e-05, "loss": 0.3461, "num_input_tokens_seen": 3512512, "step": 10440 }, { "epoch": 8.071870170015456, "grad_norm": 1.0883686542510986, "learning_rate": 3.7230385435314397e-05, "loss": 0.6121, "num_input_tokens_seen": 3514144, "step": 10445 }, { "epoch": 8.075734157650695, "grad_norm": 0.7521071434020996, "learning_rate": 3.7215678138460176e-05, "loss": 0.3532, "num_input_tokens_seen": 3515648, "step": 10450 }, { "epoch": 8.079598145285935, "grad_norm": 2.015772819519043, "learning_rate": 3.7200965285842384e-05, "loss": 0.723, "num_input_tokens_seen": 3517312, "step": 10455 }, { "epoch": 8.083462132921175, "grad_norm": 0.638363242149353, "learning_rate": 3.7186246884152505e-05, "loss": 0.5006, "num_input_tokens_seen": 3519296, "step": 10460 }, { "epoch": 8.087326120556414, "grad_norm": 1.8918392658233643, "learning_rate": 3.717152294008456e-05, "loss": 0.6925, "num_input_tokens_seen": 3521280, "step": 10465 }, { "epoch": 8.091190108191654, "grad_norm": 0.6099896430969238, "learning_rate": 3.7156793460335096e-05, "loss": 0.5639, "num_input_tokens_seen": 3522944, "step": 10470 }, { "epoch": 8.095054095826894, "grad_norm": 0.8733569979667664, "learning_rate": 3.714205845160316e-05, "loss": 0.4714, "num_input_tokens_seen": 3524480, "step": 10475 }, { "epoch": 8.098918083462133, "grad_norm": 1.3426610231399536, "learning_rate": 3.7127317920590324e-05, "loss": 0.4544, "num_input_tokens_seen": 3526048, "step": 10480 }, { "epoch": 8.102782071097373, "grad_norm": 0.625437319278717, "learning_rate": 3.7112571874000676e-05, "loss": 0.4851, "num_input_tokens_seen": 3527776, "step": 10485 }, { "epoch": 8.106646058732611, "grad_norm": 0.9567176103591919, "learning_rate": 3.709782031854079e-05, "loss": 0.6365, "num_input_tokens_seen": 3529728, "step": 10490 }, { "epoch": 8.110510046367851, "grad_norm": 0.6821291446685791, "learning_rate": 3.708306326091978e-05, "loss": 0.3464, "num_input_tokens_seen": 3531200, "step": 10495 }, { "epoch": 8.114374034003092, "grad_norm": 0.5719438791275024, "learning_rate": 3.706830070784924e-05, "loss": 0.4094, "num_input_tokens_seen": 3532992, "step": 10500 }, { "epoch": 8.11823802163833, "grad_norm": 1.0854618549346924, "learning_rate": 3.705353266604326e-05, "loss": 0.4327, "num_input_tokens_seen": 3534720, "step": 10505 }, { "epoch": 8.12210200927357, "grad_norm": 0.9601066708564758, "learning_rate": 3.703875914221843e-05, "loss": 0.6455, "num_input_tokens_seen": 3536512, "step": 10510 }, { "epoch": 8.12596599690881, "grad_norm": 1.1677578687667847, "learning_rate": 3.702398014309385e-05, "loss": 0.4994, "num_input_tokens_seen": 3538048, "step": 10515 }, { "epoch": 8.129829984544049, "grad_norm": 0.9245625138282776, "learning_rate": 3.7009195675391096e-05, "loss": 0.3819, "num_input_tokens_seen": 3539872, "step": 10520 }, { "epoch": 8.13369397217929, "grad_norm": 1.0283533334732056, "learning_rate": 3.699440574583423e-05, "loss": 0.4484, "num_input_tokens_seen": 3541472, "step": 10525 }, { "epoch": 8.13755795981453, "grad_norm": 1.0464712381362915, "learning_rate": 3.6979610361149785e-05, "loss": 0.5581, "num_input_tokens_seen": 3543104, "step": 10530 }, { "epoch": 8.141421947449768, "grad_norm": 0.7982766628265381, "learning_rate": 3.6964809528066814e-05, "loss": 0.5913, "num_input_tokens_seen": 3544832, "step": 10535 }, { "epoch": 8.145285935085008, "grad_norm": 0.6868427991867065, "learning_rate": 3.6950003253316816e-05, "loss": 0.4197, "num_input_tokens_seen": 3546432, "step": 10540 }, { "epoch": 8.149149922720248, "grad_norm": 0.6428138613700867, "learning_rate": 3.6935191543633776e-05, "loss": 0.4137, "num_input_tokens_seen": 3548448, "step": 10545 }, { "epoch": 8.153013910355487, "grad_norm": 0.6053665280342102, "learning_rate": 3.6920374405754134e-05, "loss": 0.3745, "num_input_tokens_seen": 3550016, "step": 10550 }, { "epoch": 8.156877897990727, "grad_norm": 1.0845657587051392, "learning_rate": 3.690555184641683e-05, "loss": 0.446, "num_input_tokens_seen": 3551936, "step": 10555 }, { "epoch": 8.160741885625965, "grad_norm": 0.8904715180397034, "learning_rate": 3.6890723872363256e-05, "loss": 0.378, "num_input_tokens_seen": 3553408, "step": 10560 }, { "epoch": 8.164605873261205, "grad_norm": 0.9770016074180603, "learning_rate": 3.687589049033724e-05, "loss": 0.4314, "num_input_tokens_seen": 3555040, "step": 10565 }, { "epoch": 8.168469860896446, "grad_norm": 1.108154058456421, "learning_rate": 3.686105170708511e-05, "loss": 0.4642, "num_input_tokens_seen": 3556736, "step": 10570 }, { "epoch": 8.172333848531684, "grad_norm": 0.6732720732688904, "learning_rate": 3.684620752935564e-05, "loss": 0.3555, "num_input_tokens_seen": 3558464, "step": 10575 }, { "epoch": 8.176197836166924, "grad_norm": 0.8624832034111023, "learning_rate": 3.683135796390003e-05, "loss": 0.3731, "num_input_tokens_seen": 3560192, "step": 10580 }, { "epoch": 8.180061823802165, "grad_norm": 0.7193707227706909, "learning_rate": 3.681650301747196e-05, "loss": 0.3768, "num_input_tokens_seen": 3561920, "step": 10585 }, { "epoch": 8.183925811437403, "grad_norm": 0.9742674827575684, "learning_rate": 3.680164269682756e-05, "loss": 0.5145, "num_input_tokens_seen": 3563552, "step": 10590 }, { "epoch": 8.187789799072643, "grad_norm": 0.8588480353355408, "learning_rate": 3.678677700872539e-05, "loss": 0.3871, "num_input_tokens_seen": 3564896, "step": 10595 }, { "epoch": 8.191653786707883, "grad_norm": 0.9401132464408875, "learning_rate": 3.677190595992644e-05, "loss": 0.4898, "num_input_tokens_seen": 3566528, "step": 10600 }, { "epoch": 8.195517774343122, "grad_norm": 1.1735243797302246, "learning_rate": 3.675702955719416e-05, "loss": 0.4185, "num_input_tokens_seen": 3568288, "step": 10605 }, { "epoch": 8.199381761978362, "grad_norm": 0.5801627039909363, "learning_rate": 3.674214780729443e-05, "loss": 0.4981, "num_input_tokens_seen": 3569792, "step": 10610 }, { "epoch": 8.2032457496136, "grad_norm": 1.1900784969329834, "learning_rate": 3.6727260716995555e-05, "loss": 0.3861, "num_input_tokens_seen": 3571552, "step": 10615 }, { "epoch": 8.20710973724884, "grad_norm": 0.9262180924415588, "learning_rate": 3.6712368293068274e-05, "loss": 0.3915, "num_input_tokens_seen": 3573088, "step": 10620 }, { "epoch": 8.21097372488408, "grad_norm": 1.1880384683609009, "learning_rate": 3.669747054228575e-05, "loss": 0.4357, "num_input_tokens_seen": 3574784, "step": 10625 }, { "epoch": 8.21483771251932, "grad_norm": 2.1480672359466553, "learning_rate": 3.668256747142357e-05, "loss": 0.5744, "num_input_tokens_seen": 3576352, "step": 10630 }, { "epoch": 8.21870170015456, "grad_norm": 1.3915982246398926, "learning_rate": 3.666765908725974e-05, "loss": 0.5048, "num_input_tokens_seen": 3578144, "step": 10635 }, { "epoch": 8.2225656877898, "grad_norm": 1.1712702512741089, "learning_rate": 3.6652745396574685e-05, "loss": 0.4859, "num_input_tokens_seen": 3580096, "step": 10640 }, { "epoch": 8.226429675425038, "grad_norm": 0.9068136215209961, "learning_rate": 3.6637826406151236e-05, "loss": 0.5259, "num_input_tokens_seen": 3582016, "step": 10645 }, { "epoch": 8.230293663060278, "grad_norm": 0.8444743156433105, "learning_rate": 3.662290212277464e-05, "loss": 0.4515, "num_input_tokens_seen": 3583968, "step": 10650 }, { "epoch": 8.234157650695519, "grad_norm": 1.0071724653244019, "learning_rate": 3.660797255323255e-05, "loss": 0.4335, "num_input_tokens_seen": 3585664, "step": 10655 }, { "epoch": 8.238021638330757, "grad_norm": 0.9801154136657715, "learning_rate": 3.659303770431503e-05, "loss": 0.5996, "num_input_tokens_seen": 3587264, "step": 10660 }, { "epoch": 8.241885625965997, "grad_norm": 0.7817800641059875, "learning_rate": 3.657809758281454e-05, "loss": 0.4177, "num_input_tokens_seen": 3589024, "step": 10665 }, { "epoch": 8.245749613601237, "grad_norm": 1.3611558675765991, "learning_rate": 3.656315219552592e-05, "loss": 0.4794, "num_input_tokens_seen": 3590784, "step": 10670 }, { "epoch": 8.249613601236476, "grad_norm": 0.7471657395362854, "learning_rate": 3.654820154924643e-05, "loss": 0.4437, "num_input_tokens_seen": 3592608, "step": 10675 }, { "epoch": 8.253477588871716, "grad_norm": 0.8104395866394043, "learning_rate": 3.6533245650775726e-05, "loss": 0.3774, "num_input_tokens_seen": 3593984, "step": 10680 }, { "epoch": 8.257341576506954, "grad_norm": 0.8348826766014099, "learning_rate": 3.651828450691582e-05, "loss": 0.3814, "num_input_tokens_seen": 3595424, "step": 10685 }, { "epoch": 8.261205564142195, "grad_norm": 1.1413121223449707, "learning_rate": 3.650331812447114e-05, "loss": 0.4143, "num_input_tokens_seen": 3597216, "step": 10690 }, { "epoch": 8.265069551777435, "grad_norm": 0.8998156189918518, "learning_rate": 3.64883465102485e-05, "loss": 0.5773, "num_input_tokens_seen": 3598912, "step": 10695 }, { "epoch": 8.268933539412673, "grad_norm": 0.8681609034538269, "learning_rate": 3.647336967105706e-05, "loss": 0.3802, "num_input_tokens_seen": 3600608, "step": 10700 }, { "epoch": 8.272797527047913, "grad_norm": 0.8802154064178467, "learning_rate": 3.645838761370838e-05, "loss": 0.3896, "num_input_tokens_seen": 3602592, "step": 10705 }, { "epoch": 8.276661514683154, "grad_norm": 0.899433434009552, "learning_rate": 3.6443400345016385e-05, "loss": 0.5067, "num_input_tokens_seen": 3604064, "step": 10710 }, { "epoch": 8.280525502318392, "grad_norm": 0.5044617652893066, "learning_rate": 3.6428407871797396e-05, "loss": 0.4709, "num_input_tokens_seen": 3605856, "step": 10715 }, { "epoch": 8.284389489953632, "grad_norm": 0.7367283701896667, "learning_rate": 3.6413410200870055e-05, "loss": 0.4592, "num_input_tokens_seen": 3607424, "step": 10720 }, { "epoch": 8.288253477588873, "grad_norm": 1.0071918964385986, "learning_rate": 3.639840733905541e-05, "loss": 0.385, "num_input_tokens_seen": 3609536, "step": 10725 }, { "epoch": 8.292117465224111, "grad_norm": 0.595674455165863, "learning_rate": 3.638339929317683e-05, "loss": 0.5799, "num_input_tokens_seen": 3611168, "step": 10730 }, { "epoch": 8.295981452859351, "grad_norm": 0.70705646276474, "learning_rate": 3.63683860700601e-05, "loss": 0.4721, "num_input_tokens_seen": 3612928, "step": 10735 }, { "epoch": 8.29984544049459, "grad_norm": 0.7058749198913574, "learning_rate": 3.6353367676533285e-05, "loss": 0.4751, "num_input_tokens_seen": 3614464, "step": 10740 }, { "epoch": 8.30370942812983, "grad_norm": 2.372607469558716, "learning_rate": 3.6338344119426866e-05, "loss": 0.5593, "num_input_tokens_seen": 3616192, "step": 10745 }, { "epoch": 8.30757341576507, "grad_norm": 1.4567261934280396, "learning_rate": 3.632331540557363e-05, "loss": 0.4327, "num_input_tokens_seen": 3618016, "step": 10750 }, { "epoch": 8.311437403400308, "grad_norm": 0.7455491423606873, "learning_rate": 3.6308281541808745e-05, "loss": 0.3845, "num_input_tokens_seen": 3619520, "step": 10755 }, { "epoch": 8.315301391035549, "grad_norm": 0.6505072116851807, "learning_rate": 3.629324253496969e-05, "loss": 0.4108, "num_input_tokens_seen": 3620896, "step": 10760 }, { "epoch": 8.319165378670789, "grad_norm": 0.7022231817245483, "learning_rate": 3.62781983918963e-05, "loss": 0.4488, "num_input_tokens_seen": 3622720, "step": 10765 }, { "epoch": 8.323029366306027, "grad_norm": 0.9034740328788757, "learning_rate": 3.6263149119430727e-05, "loss": 0.5656, "num_input_tokens_seen": 3624448, "step": 10770 }, { "epoch": 8.326893353941268, "grad_norm": 1.0365073680877686, "learning_rate": 3.6248094724417494e-05, "loss": 0.6123, "num_input_tokens_seen": 3626176, "step": 10775 }, { "epoch": 8.330757341576508, "grad_norm": 1.296982765197754, "learning_rate": 3.623303521370342e-05, "loss": 0.4119, "num_input_tokens_seen": 3627744, "step": 10780 }, { "epoch": 8.334621329211746, "grad_norm": 0.7473868727684021, "learning_rate": 3.621797059413765e-05, "loss": 0.4099, "num_input_tokens_seen": 3629472, "step": 10785 }, { "epoch": 8.338485316846986, "grad_norm": 0.7568243741989136, "learning_rate": 3.6202900872571674e-05, "loss": 0.3868, "num_input_tokens_seen": 3630976, "step": 10790 }, { "epoch": 8.342349304482227, "grad_norm": 0.8084076046943665, "learning_rate": 3.6187826055859286e-05, "loss": 0.4272, "num_input_tokens_seen": 3632352, "step": 10795 }, { "epoch": 8.346213292117465, "grad_norm": 0.8911157250404358, "learning_rate": 3.6172746150856615e-05, "loss": 0.3748, "num_input_tokens_seen": 3633760, "step": 10800 }, { "epoch": 8.350077279752705, "grad_norm": 1.3172489404678345, "learning_rate": 3.6157661164422086e-05, "loss": 0.5651, "num_input_tokens_seen": 3635616, "step": 10805 }, { "epoch": 8.353941267387944, "grad_norm": 0.9973784685134888, "learning_rate": 3.6142571103416424e-05, "loss": 0.3437, "num_input_tokens_seen": 3637024, "step": 10810 }, { "epoch": 8.357805255023184, "grad_norm": 0.751388669013977, "learning_rate": 3.612747597470271e-05, "loss": 0.4267, "num_input_tokens_seen": 3638880, "step": 10815 }, { "epoch": 8.361669242658424, "grad_norm": 0.7090305089950562, "learning_rate": 3.6112375785146276e-05, "loss": 0.3599, "num_input_tokens_seen": 3640416, "step": 10820 }, { "epoch": 8.365533230293662, "grad_norm": 2.0854578018188477, "learning_rate": 3.609727054161478e-05, "loss": 0.6775, "num_input_tokens_seen": 3642144, "step": 10825 }, { "epoch": 8.369397217928903, "grad_norm": 1.7556939125061035, "learning_rate": 3.608216025097819e-05, "loss": 0.497, "num_input_tokens_seen": 3643808, "step": 10830 }, { "epoch": 8.373261205564143, "grad_norm": 0.7124185562133789, "learning_rate": 3.606704492010875e-05, "loss": 0.4306, "num_input_tokens_seen": 3645504, "step": 10835 }, { "epoch": 8.377125193199381, "grad_norm": 0.7970505952835083, "learning_rate": 3.6051924555881e-05, "loss": 0.4286, "num_input_tokens_seen": 3647200, "step": 10840 }, { "epoch": 8.380989180834622, "grad_norm": 0.5921110510826111, "learning_rate": 3.6036799165171775e-05, "loss": 0.5024, "num_input_tokens_seen": 3648864, "step": 10845 }, { "epoch": 8.384853168469862, "grad_norm": 1.0545504093170166, "learning_rate": 3.602166875486019e-05, "loss": 0.5019, "num_input_tokens_seen": 3650624, "step": 10850 }, { "epoch": 8.3887171561051, "grad_norm": 0.5389737486839294, "learning_rate": 3.600653333182765e-05, "loss": 0.3681, "num_input_tokens_seen": 3652192, "step": 10855 }, { "epoch": 8.39258114374034, "grad_norm": 1.5943667888641357, "learning_rate": 3.599139290295784e-05, "loss": 0.4363, "num_input_tokens_seen": 3653632, "step": 10860 }, { "epoch": 8.396445131375579, "grad_norm": 0.6749444007873535, "learning_rate": 3.597624747513671e-05, "loss": 0.427, "num_input_tokens_seen": 3655328, "step": 10865 }, { "epoch": 8.400309119010819, "grad_norm": 1.2955317497253418, "learning_rate": 3.596109705525249e-05, "loss": 0.4185, "num_input_tokens_seen": 3656896, "step": 10870 }, { "epoch": 8.40417310664606, "grad_norm": 1.8387773036956787, "learning_rate": 3.5945941650195694e-05, "loss": 0.4737, "num_input_tokens_seen": 3658528, "step": 10875 }, { "epoch": 8.408037094281298, "grad_norm": 0.8104801774024963, "learning_rate": 3.593078126685908e-05, "loss": 0.4974, "num_input_tokens_seen": 3660160, "step": 10880 }, { "epoch": 8.411901081916538, "grad_norm": 0.8377550840377808, "learning_rate": 3.591561591213768e-05, "loss": 0.3942, "num_input_tokens_seen": 3661760, "step": 10885 }, { "epoch": 8.415765069551778, "grad_norm": 0.4976840317249298, "learning_rate": 3.590044559292879e-05, "loss": 0.4082, "num_input_tokens_seen": 3663424, "step": 10890 }, { "epoch": 8.419629057187016, "grad_norm": 1.0055134296417236, "learning_rate": 3.588527031613197e-05, "loss": 0.4489, "num_input_tokens_seen": 3665056, "step": 10895 }, { "epoch": 8.423493044822257, "grad_norm": 1.0264873504638672, "learning_rate": 3.5870090088649025e-05, "loss": 0.4843, "num_input_tokens_seen": 3666976, "step": 10900 }, { "epoch": 8.427357032457497, "grad_norm": 0.8963702321052551, "learning_rate": 3.5854904917384e-05, "loss": 0.3947, "num_input_tokens_seen": 3668608, "step": 10905 }, { "epoch": 8.431221020092735, "grad_norm": 1.0718343257904053, "learning_rate": 3.5839714809243216e-05, "loss": 0.447, "num_input_tokens_seen": 3670240, "step": 10910 }, { "epoch": 8.435085007727976, "grad_norm": 1.0847628116607666, "learning_rate": 3.582451977113521e-05, "loss": 0.4689, "num_input_tokens_seen": 3671904, "step": 10915 }, { "epoch": 8.438948995363216, "grad_norm": 0.7239774465560913, "learning_rate": 3.58093198099708e-05, "loss": 0.5515, "num_input_tokens_seen": 3673728, "step": 10920 }, { "epoch": 8.442812982998454, "grad_norm": 1.1795848608016968, "learning_rate": 3.5794114932663006e-05, "loss": 0.6047, "num_input_tokens_seen": 3675264, "step": 10925 }, { "epoch": 8.446676970633694, "grad_norm": 1.5835850238800049, "learning_rate": 3.5778905146127086e-05, "loss": 0.5986, "num_input_tokens_seen": 3676928, "step": 10930 }, { "epoch": 8.450540958268933, "grad_norm": 1.028698205947876, "learning_rate": 3.5763690457280566e-05, "loss": 0.4578, "num_input_tokens_seen": 3678592, "step": 10935 }, { "epoch": 8.454404945904173, "grad_norm": 0.8500005602836609, "learning_rate": 3.574847087304316e-05, "loss": 0.2974, "num_input_tokens_seen": 3680256, "step": 10940 }, { "epoch": 8.458268933539413, "grad_norm": 1.4482454061508179, "learning_rate": 3.5733246400336825e-05, "loss": 0.4826, "num_input_tokens_seen": 3682048, "step": 10945 }, { "epoch": 8.462132921174652, "grad_norm": 0.8905768990516663, "learning_rate": 3.571801704608575e-05, "loss": 0.4137, "num_input_tokens_seen": 3683552, "step": 10950 }, { "epoch": 8.465996908809892, "grad_norm": 1.2216862440109253, "learning_rate": 3.570278281721634e-05, "loss": 0.5681, "num_input_tokens_seen": 3685088, "step": 10955 }, { "epoch": 8.469860896445132, "grad_norm": 0.6391904950141907, "learning_rate": 3.568754372065721e-05, "loss": 0.4891, "num_input_tokens_seen": 3686688, "step": 10960 }, { "epoch": 8.47372488408037, "grad_norm": 1.412736415863037, "learning_rate": 3.5672299763339185e-05, "loss": 0.5279, "num_input_tokens_seen": 3688288, "step": 10965 }, { "epoch": 8.47758887171561, "grad_norm": 0.8413355350494385, "learning_rate": 3.565705095219531e-05, "loss": 0.3849, "num_input_tokens_seen": 3690240, "step": 10970 }, { "epoch": 8.481452859350851, "grad_norm": 0.6438280940055847, "learning_rate": 3.564179729416085e-05, "loss": 0.4127, "num_input_tokens_seen": 3691808, "step": 10975 }, { "epoch": 8.48531684698609, "grad_norm": 0.8586921691894531, "learning_rate": 3.562653879617324e-05, "loss": 0.4699, "num_input_tokens_seen": 3693920, "step": 10980 }, { "epoch": 8.48918083462133, "grad_norm": 1.2694647312164307, "learning_rate": 3.561127546517215e-05, "loss": 0.6035, "num_input_tokens_seen": 3695424, "step": 10985 }, { "epoch": 8.493044822256568, "grad_norm": 1.5073250532150269, "learning_rate": 3.559600730809943e-05, "loss": 0.5024, "num_input_tokens_seen": 3697056, "step": 10990 }, { "epoch": 8.496908809891808, "grad_norm": 0.9287497401237488, "learning_rate": 3.558073433189913e-05, "loss": 0.5188, "num_input_tokens_seen": 3698592, "step": 10995 }, { "epoch": 8.500772797527048, "grad_norm": 0.6160764098167419, "learning_rate": 3.556545654351749e-05, "loss": 0.3671, "num_input_tokens_seen": 3700352, "step": 11000 }, { "epoch": 8.504636785162287, "grad_norm": 0.6886796951293945, "learning_rate": 3.555017394990294e-05, "loss": 0.4236, "num_input_tokens_seen": 3701984, "step": 11005 }, { "epoch": 8.508500772797527, "grad_norm": 0.7630226612091064, "learning_rate": 3.5534886558006094e-05, "loss": 0.4247, "num_input_tokens_seen": 3703712, "step": 11010 }, { "epoch": 8.512364760432767, "grad_norm": 1.4041532278060913, "learning_rate": 3.5519594374779744e-05, "loss": 0.5894, "num_input_tokens_seen": 3705632, "step": 11015 }, { "epoch": 8.516228748068006, "grad_norm": 0.7912706732749939, "learning_rate": 3.5504297407178885e-05, "loss": 0.3778, "num_input_tokens_seen": 3707488, "step": 11020 }, { "epoch": 8.520092735703246, "grad_norm": 0.9656922817230225, "learning_rate": 3.548899566216065e-05, "loss": 0.59, "num_input_tokens_seen": 3709248, "step": 11025 }, { "epoch": 8.523956723338486, "grad_norm": 0.8634523749351501, "learning_rate": 3.547368914668438e-05, "loss": 0.3704, "num_input_tokens_seen": 3710784, "step": 11030 }, { "epoch": 8.527820710973725, "grad_norm": 1.00136399269104, "learning_rate": 3.545837786771155e-05, "loss": 0.4429, "num_input_tokens_seen": 3712544, "step": 11035 }, { "epoch": 8.531684698608965, "grad_norm": 1.0875974893569946, "learning_rate": 3.544306183220584e-05, "loss": 0.5135, "num_input_tokens_seen": 3714496, "step": 11040 }, { "epoch": 8.535548686244205, "grad_norm": 1.6049761772155762, "learning_rate": 3.5427741047133065e-05, "loss": 0.4309, "num_input_tokens_seen": 3716160, "step": 11045 }, { "epoch": 8.539412673879443, "grad_norm": 1.2049201726913452, "learning_rate": 3.541241551946122e-05, "loss": 0.44, "num_input_tokens_seen": 3717696, "step": 11050 }, { "epoch": 8.543276661514684, "grad_norm": 0.5159958600997925, "learning_rate": 3.539708525616042e-05, "loss": 0.4291, "num_input_tokens_seen": 3719392, "step": 11055 }, { "epoch": 8.547140649149922, "grad_norm": 0.8548684120178223, "learning_rate": 3.5381750264203004e-05, "loss": 0.403, "num_input_tokens_seen": 3721088, "step": 11060 }, { "epoch": 8.551004636785162, "grad_norm": 1.1115171909332275, "learning_rate": 3.536641055056338e-05, "loss": 0.4102, "num_input_tokens_seen": 3722880, "step": 11065 }, { "epoch": 8.554868624420402, "grad_norm": 1.4512712955474854, "learning_rate": 3.5351066122218155e-05, "loss": 0.3897, "num_input_tokens_seen": 3724704, "step": 11070 }, { "epoch": 8.55873261205564, "grad_norm": 0.7516743540763855, "learning_rate": 3.533571698614607e-05, "loss": 0.396, "num_input_tokens_seen": 3726112, "step": 11075 }, { "epoch": 8.562596599690881, "grad_norm": 1.8447202444076538, "learning_rate": 3.5320363149328006e-05, "loss": 0.465, "num_input_tokens_seen": 3727744, "step": 11080 }, { "epoch": 8.566460587326121, "grad_norm": 0.7062959671020508, "learning_rate": 3.5305004618746976e-05, "loss": 0.413, "num_input_tokens_seen": 3729472, "step": 11085 }, { "epoch": 8.57032457496136, "grad_norm": 0.6618844270706177, "learning_rate": 3.528964140138812e-05, "loss": 0.4085, "num_input_tokens_seen": 3731392, "step": 11090 }, { "epoch": 8.5741885625966, "grad_norm": 0.6293127536773682, "learning_rate": 3.527427350423874e-05, "loss": 0.5398, "num_input_tokens_seen": 3733024, "step": 11095 }, { "epoch": 8.578052550231838, "grad_norm": 0.927886426448822, "learning_rate": 3.525890093428824e-05, "loss": 0.5407, "num_input_tokens_seen": 3734624, "step": 11100 }, { "epoch": 8.581916537867079, "grad_norm": 0.8676866888999939, "learning_rate": 3.524352369852816e-05, "loss": 0.3781, "num_input_tokens_seen": 3736352, "step": 11105 }, { "epoch": 8.585780525502319, "grad_norm": 0.6273418664932251, "learning_rate": 3.522814180395215e-05, "loss": 0.3797, "num_input_tokens_seen": 3738048, "step": 11110 }, { "epoch": 8.589644513137557, "grad_norm": 0.6669108867645264, "learning_rate": 3.521275525755599e-05, "loss": 0.607, "num_input_tokens_seen": 3739904, "step": 11115 }, { "epoch": 8.593508500772797, "grad_norm": 1.3563764095306396, "learning_rate": 3.5197364066337585e-05, "loss": 0.6366, "num_input_tokens_seen": 3741440, "step": 11120 }, { "epoch": 8.597372488408038, "grad_norm": 0.9598343968391418, "learning_rate": 3.518196823729693e-05, "loss": 0.5411, "num_input_tokens_seen": 3743072, "step": 11125 }, { "epoch": 8.601236476043276, "grad_norm": 1.0115392208099365, "learning_rate": 3.5166567777436146e-05, "loss": 0.5479, "num_input_tokens_seen": 3744768, "step": 11130 }, { "epoch": 8.605100463678516, "grad_norm": 0.7960202693939209, "learning_rate": 3.5151162693759455e-05, "loss": 0.4702, "num_input_tokens_seen": 3746464, "step": 11135 }, { "epoch": 8.608964451313756, "grad_norm": 1.1953043937683105, "learning_rate": 3.513575299327317e-05, "loss": 0.7584, "num_input_tokens_seen": 3748128, "step": 11140 }, { "epoch": 8.612828438948995, "grad_norm": 0.7514758110046387, "learning_rate": 3.5120338682985725e-05, "loss": 0.4495, "num_input_tokens_seen": 3749792, "step": 11145 }, { "epoch": 8.616692426584235, "grad_norm": 0.9550819993019104, "learning_rate": 3.510491976990764e-05, "loss": 0.533, "num_input_tokens_seen": 3751520, "step": 11150 }, { "epoch": 8.620556414219475, "grad_norm": 0.872953474521637, "learning_rate": 3.508949626105152e-05, "loss": 0.3701, "num_input_tokens_seen": 3753280, "step": 11155 }, { "epoch": 8.624420401854714, "grad_norm": 0.8960431218147278, "learning_rate": 3.507406816343209e-05, "loss": 0.3886, "num_input_tokens_seen": 3754944, "step": 11160 }, { "epoch": 8.628284389489954, "grad_norm": 0.7550051212310791, "learning_rate": 3.505863548406613e-05, "loss": 0.7418, "num_input_tokens_seen": 3756576, "step": 11165 }, { "epoch": 8.632148377125194, "grad_norm": 1.0263943672180176, "learning_rate": 3.50431982299725e-05, "loss": 0.4045, "num_input_tokens_seen": 3758112, "step": 11170 }, { "epoch": 8.636012364760433, "grad_norm": 2.0944972038269043, "learning_rate": 3.502775640817217e-05, "loss": 0.6307, "num_input_tokens_seen": 3759584, "step": 11175 }, { "epoch": 8.639876352395673, "grad_norm": 0.907268762588501, "learning_rate": 3.5012310025688176e-05, "loss": 0.4662, "num_input_tokens_seen": 3761440, "step": 11180 }, { "epoch": 8.643740340030911, "grad_norm": 1.076227068901062, "learning_rate": 3.499685908954562e-05, "loss": 0.4939, "num_input_tokens_seen": 3763232, "step": 11185 }, { "epoch": 8.647604327666151, "grad_norm": 0.6096629500389099, "learning_rate": 3.498140360677168e-05, "loss": 0.5767, "num_input_tokens_seen": 3764928, "step": 11190 }, { "epoch": 8.651468315301392, "grad_norm": 0.780307948589325, "learning_rate": 3.4965943584395604e-05, "loss": 0.4653, "num_input_tokens_seen": 3766880, "step": 11195 }, { "epoch": 8.65533230293663, "grad_norm": 0.7766649127006531, "learning_rate": 3.4950479029448706e-05, "loss": 0.4505, "num_input_tokens_seen": 3768672, "step": 11200 }, { "epoch": 8.65919629057187, "grad_norm": 0.7403049468994141, "learning_rate": 3.4935009948964345e-05, "loss": 0.5318, "num_input_tokens_seen": 3770272, "step": 11205 }, { "epoch": 8.66306027820711, "grad_norm": 0.5978745818138123, "learning_rate": 3.491953634997796e-05, "loss": 0.4264, "num_input_tokens_seen": 3771936, "step": 11210 }, { "epoch": 8.666924265842349, "grad_norm": 1.325330138206482, "learning_rate": 3.4904058239527055e-05, "loss": 0.4889, "num_input_tokens_seen": 3773856, "step": 11215 }, { "epoch": 8.670788253477589, "grad_norm": 0.823311448097229, "learning_rate": 3.4888575624651144e-05, "loss": 0.4524, "num_input_tokens_seen": 3775424, "step": 11220 }, { "epoch": 8.674652241112828, "grad_norm": 0.5453718304634094, "learning_rate": 3.487308851239181e-05, "loss": 0.3741, "num_input_tokens_seen": 3777376, "step": 11225 }, { "epoch": 8.678516228748068, "grad_norm": 0.9864352345466614, "learning_rate": 3.4857596909792694e-05, "loss": 0.4874, "num_input_tokens_seen": 3779200, "step": 11230 }, { "epoch": 8.682380216383308, "grad_norm": 0.6342862844467163, "learning_rate": 3.484210082389947e-05, "loss": 0.4062, "num_input_tokens_seen": 3780832, "step": 11235 }, { "epoch": 8.686244204018546, "grad_norm": 1.4137680530548096, "learning_rate": 3.482660026175985e-05, "loss": 0.5745, "num_input_tokens_seen": 3782528, "step": 11240 }, { "epoch": 8.690108191653787, "grad_norm": 0.7298208475112915, "learning_rate": 3.4811095230423586e-05, "loss": 0.6558, "num_input_tokens_seen": 3784224, "step": 11245 }, { "epoch": 8.693972179289027, "grad_norm": 1.404952883720398, "learning_rate": 3.479558573694245e-05, "loss": 0.5359, "num_input_tokens_seen": 3785984, "step": 11250 }, { "epoch": 8.697836166924265, "grad_norm": 1.207131028175354, "learning_rate": 3.478007178837026e-05, "loss": 0.6694, "num_input_tokens_seen": 3787552, "step": 11255 }, { "epoch": 8.701700154559505, "grad_norm": 0.7846658229827881, "learning_rate": 3.476455339176284e-05, "loss": 0.3604, "num_input_tokens_seen": 3788928, "step": 11260 }, { "epoch": 8.705564142194746, "grad_norm": 0.729637861251831, "learning_rate": 3.474903055417807e-05, "loss": 0.5625, "num_input_tokens_seen": 3790496, "step": 11265 }, { "epoch": 8.709428129829984, "grad_norm": 0.8066650032997131, "learning_rate": 3.473350328267582e-05, "loss": 0.4602, "num_input_tokens_seen": 3792352, "step": 11270 }, { "epoch": 8.713292117465224, "grad_norm": 1.55539870262146, "learning_rate": 3.4717971584317984e-05, "loss": 0.4666, "num_input_tokens_seen": 3793760, "step": 11275 }, { "epoch": 8.717156105100464, "grad_norm": 1.1309452056884766, "learning_rate": 3.470243546616847e-05, "loss": 0.7158, "num_input_tokens_seen": 3795520, "step": 11280 }, { "epoch": 8.721020092735703, "grad_norm": 0.6817201972007751, "learning_rate": 3.468689493529321e-05, "loss": 0.499, "num_input_tokens_seen": 3797088, "step": 11285 }, { "epoch": 8.724884080370943, "grad_norm": 1.4482985734939575, "learning_rate": 3.4671349998760104e-05, "loss": 0.6494, "num_input_tokens_seen": 3798976, "step": 11290 }, { "epoch": 8.728748068006183, "grad_norm": 1.1587966680526733, "learning_rate": 3.465580066363911e-05, "loss": 0.507, "num_input_tokens_seen": 3800608, "step": 11295 }, { "epoch": 8.732612055641422, "grad_norm": 0.5773281455039978, "learning_rate": 3.4640246937002144e-05, "loss": 0.5054, "num_input_tokens_seen": 3802048, "step": 11300 }, { "epoch": 8.736476043276662, "grad_norm": 0.714570939540863, "learning_rate": 3.4624688825923146e-05, "loss": 0.5789, "num_input_tokens_seen": 3803776, "step": 11305 }, { "epoch": 8.7403400309119, "grad_norm": 1.4437544345855713, "learning_rate": 3.4609126337478016e-05, "loss": 0.4659, "num_input_tokens_seen": 3805312, "step": 11310 }, { "epoch": 8.74420401854714, "grad_norm": 0.9625111222267151, "learning_rate": 3.459355947874469e-05, "loss": 0.4895, "num_input_tokens_seen": 3807072, "step": 11315 }, { "epoch": 8.74806800618238, "grad_norm": 1.0166609287261963, "learning_rate": 3.457798825680306e-05, "loss": 0.415, "num_input_tokens_seen": 3808544, "step": 11320 }, { "epoch": 8.75193199381762, "grad_norm": 0.6972863078117371, "learning_rate": 3.456241267873501e-05, "loss": 0.4667, "num_input_tokens_seen": 3810016, "step": 11325 }, { "epoch": 8.75579598145286, "grad_norm": 0.9294158220291138, "learning_rate": 3.45468327516244e-05, "loss": 0.5231, "num_input_tokens_seen": 3811744, "step": 11330 }, { "epoch": 8.7596599690881, "grad_norm": 1.3232685327529907, "learning_rate": 3.4531248482557086e-05, "loss": 0.532, "num_input_tokens_seen": 3813472, "step": 11335 }, { "epoch": 8.763523956723338, "grad_norm": 0.9898413419723511, "learning_rate": 3.4515659878620886e-05, "loss": 0.4351, "num_input_tokens_seen": 3815072, "step": 11340 }, { "epoch": 8.767387944358578, "grad_norm": 0.7073198556900024, "learning_rate": 3.4500066946905585e-05, "loss": 0.4257, "num_input_tokens_seen": 3816800, "step": 11345 }, { "epoch": 8.771251931993817, "grad_norm": 0.8171571493148804, "learning_rate": 3.4484469694502934e-05, "loss": 0.4467, "num_input_tokens_seen": 3818400, "step": 11350 }, { "epoch": 8.775115919629057, "grad_norm": 0.8532257080078125, "learning_rate": 3.446886812850668e-05, "loss": 0.5046, "num_input_tokens_seen": 3819936, "step": 11355 }, { "epoch": 8.778979907264297, "grad_norm": 1.3084081411361694, "learning_rate": 3.4453262256012476e-05, "loss": 0.49, "num_input_tokens_seen": 3821600, "step": 11360 }, { "epoch": 8.782843894899536, "grad_norm": 1.1949658393859863, "learning_rate": 3.4437652084118e-05, "loss": 0.4661, "num_input_tokens_seen": 3823232, "step": 11365 }, { "epoch": 8.786707882534776, "grad_norm": 0.6097058057785034, "learning_rate": 3.4422037619922826e-05, "loss": 0.3578, "num_input_tokens_seen": 3824960, "step": 11370 }, { "epoch": 8.790571870170016, "grad_norm": 0.9788790345191956, "learning_rate": 3.440641887052852e-05, "loss": 0.4941, "num_input_tokens_seen": 3826592, "step": 11375 }, { "epoch": 8.794435857805254, "grad_norm": 0.9402461051940918, "learning_rate": 3.439079584303858e-05, "loss": 0.367, "num_input_tokens_seen": 3828192, "step": 11380 }, { "epoch": 8.798299845440495, "grad_norm": 0.9842297434806824, "learning_rate": 3.437516854455846e-05, "loss": 0.593, "num_input_tokens_seen": 3829664, "step": 11385 }, { "epoch": 8.802163833075735, "grad_norm": 1.4824503660202026, "learning_rate": 3.4359536982195527e-05, "loss": 0.6377, "num_input_tokens_seen": 3831488, "step": 11390 }, { "epoch": 8.806027820710973, "grad_norm": 0.769279956817627, "learning_rate": 3.4343901163059125e-05, "loss": 0.3473, "num_input_tokens_seen": 3833120, "step": 11395 }, { "epoch": 8.809891808346213, "grad_norm": 0.6109853386878967, "learning_rate": 3.432826109426052e-05, "loss": 0.4696, "num_input_tokens_seen": 3834752, "step": 11400 }, { "epoch": 8.813755795981454, "grad_norm": 0.8044131994247437, "learning_rate": 3.4312616782912897e-05, "loss": 0.5617, "num_input_tokens_seen": 3836480, "step": 11405 }, { "epoch": 8.817619783616692, "grad_norm": 0.7531542181968689, "learning_rate": 3.42969682361314e-05, "loss": 0.4506, "num_input_tokens_seen": 3838144, "step": 11410 }, { "epoch": 8.821483771251932, "grad_norm": 0.9863028526306152, "learning_rate": 3.428131546103306e-05, "loss": 0.5642, "num_input_tokens_seen": 3839712, "step": 11415 }, { "epoch": 8.825347758887172, "grad_norm": 1.0114612579345703, "learning_rate": 3.4265658464736876e-05, "loss": 0.4258, "num_input_tokens_seen": 3841312, "step": 11420 }, { "epoch": 8.829211746522411, "grad_norm": 0.8515927195549011, "learning_rate": 3.424999725436373e-05, "loss": 0.4286, "num_input_tokens_seen": 3843040, "step": 11425 }, { "epoch": 8.833075734157651, "grad_norm": 0.8833584785461426, "learning_rate": 3.423433183703643e-05, "loss": 0.3848, "num_input_tokens_seen": 3844384, "step": 11430 }, { "epoch": 8.83693972179289, "grad_norm": 0.5791917443275452, "learning_rate": 3.421866221987972e-05, "loss": 0.4313, "num_input_tokens_seen": 3846176, "step": 11435 }, { "epoch": 8.84080370942813, "grad_norm": 0.7564656734466553, "learning_rate": 3.420298841002021e-05, "loss": 0.5063, "num_input_tokens_seen": 3848032, "step": 11440 }, { "epoch": 8.84466769706337, "grad_norm": 1.2410632371902466, "learning_rate": 3.4187310414586474e-05, "loss": 0.586, "num_input_tokens_seen": 3849792, "step": 11445 }, { "epoch": 8.848531684698608, "grad_norm": 0.9107028841972351, "learning_rate": 3.417162824070892e-05, "loss": 0.478, "num_input_tokens_seen": 3851456, "step": 11450 }, { "epoch": 8.852395672333849, "grad_norm": 0.8835954666137695, "learning_rate": 3.415594189551993e-05, "loss": 0.4757, "num_input_tokens_seen": 3853440, "step": 11455 }, { "epoch": 8.856259659969089, "grad_norm": 0.5812865495681763, "learning_rate": 3.414025138615372e-05, "loss": 0.509, "num_input_tokens_seen": 3855264, "step": 11460 }, { "epoch": 8.860123647604327, "grad_norm": 0.8014335632324219, "learning_rate": 3.4124556719746455e-05, "loss": 0.3756, "num_input_tokens_seen": 3856768, "step": 11465 }, { "epoch": 8.863987635239567, "grad_norm": 1.4319733381271362, "learning_rate": 3.410885790343614e-05, "loss": 0.5284, "num_input_tokens_seen": 3858432, "step": 11470 }, { "epoch": 8.867851622874806, "grad_norm": 0.7160114049911499, "learning_rate": 3.4093154944362706e-05, "loss": 0.3783, "num_input_tokens_seen": 3859936, "step": 11475 }, { "epoch": 8.871715610510046, "grad_norm": 0.7978728413581848, "learning_rate": 3.407744784966795e-05, "loss": 0.5613, "num_input_tokens_seen": 3861632, "step": 11480 }, { "epoch": 8.875579598145286, "grad_norm": 0.8140117526054382, "learning_rate": 3.406173662649554e-05, "loss": 0.3688, "num_input_tokens_seen": 3863136, "step": 11485 }, { "epoch": 8.879443585780525, "grad_norm": 1.3149124383926392, "learning_rate": 3.404602128199105e-05, "loss": 0.3454, "num_input_tokens_seen": 3864864, "step": 11490 }, { "epoch": 8.883307573415765, "grad_norm": 0.8164753913879395, "learning_rate": 3.40303018233019e-05, "loss": 0.3971, "num_input_tokens_seen": 3866592, "step": 11495 }, { "epoch": 8.887171561051005, "grad_norm": 1.0268651247024536, "learning_rate": 3.40145782575774e-05, "loss": 0.5929, "num_input_tokens_seen": 3868320, "step": 11500 }, { "epoch": 8.891035548686244, "grad_norm": 0.7759486436843872, "learning_rate": 3.399885059196873e-05, "loss": 0.4098, "num_input_tokens_seen": 3869952, "step": 11505 }, { "epoch": 8.894899536321484, "grad_norm": 0.8196527361869812, "learning_rate": 3.3983118833628914e-05, "loss": 0.4131, "num_input_tokens_seen": 3871840, "step": 11510 }, { "epoch": 8.898763523956724, "grad_norm": 1.2097288370132446, "learning_rate": 3.3967382989712856e-05, "loss": 0.6846, "num_input_tokens_seen": 3873504, "step": 11515 }, { "epoch": 8.902627511591962, "grad_norm": 1.049023151397705, "learning_rate": 3.39516430673773e-05, "loss": 0.57, "num_input_tokens_seen": 3875104, "step": 11520 }, { "epoch": 8.906491499227203, "grad_norm": 0.7356892228126526, "learning_rate": 3.3935899073780885e-05, "loss": 0.3282, "num_input_tokens_seen": 3876544, "step": 11525 }, { "epoch": 8.910355486862443, "grad_norm": 0.7608158588409424, "learning_rate": 3.392015101608405e-05, "loss": 0.4127, "num_input_tokens_seen": 3878112, "step": 11530 }, { "epoch": 8.914219474497681, "grad_norm": 0.9418018460273743, "learning_rate": 3.39043989014491e-05, "loss": 0.601, "num_input_tokens_seen": 3879808, "step": 11535 }, { "epoch": 8.918083462132921, "grad_norm": 1.137512445449829, "learning_rate": 3.3888642737040224e-05, "loss": 0.5356, "num_input_tokens_seen": 3881728, "step": 11540 }, { "epoch": 8.921947449768162, "grad_norm": 1.0818477869033813, "learning_rate": 3.387288253002339e-05, "loss": 0.3548, "num_input_tokens_seen": 3883488, "step": 11545 }, { "epoch": 8.9258114374034, "grad_norm": 1.7605301141738892, "learning_rate": 3.385711828756644e-05, "loss": 0.4002, "num_input_tokens_seen": 3885024, "step": 11550 }, { "epoch": 8.92967542503864, "grad_norm": 1.1115648746490479, "learning_rate": 3.384135001683905e-05, "loss": 0.6222, "num_input_tokens_seen": 3886784, "step": 11555 }, { "epoch": 8.933539412673879, "grad_norm": 1.2200899124145508, "learning_rate": 3.382557772501273e-05, "loss": 0.4059, "num_input_tokens_seen": 3888960, "step": 11560 }, { "epoch": 8.937403400309119, "grad_norm": 0.9066911935806274, "learning_rate": 3.38098014192608e-05, "loss": 0.4589, "num_input_tokens_seen": 3890816, "step": 11565 }, { "epoch": 8.94126738794436, "grad_norm": 1.4673646688461304, "learning_rate": 3.379402110675843e-05, "loss": 0.6909, "num_input_tokens_seen": 3892512, "step": 11570 }, { "epoch": 8.945131375579598, "grad_norm": 0.6730450987815857, "learning_rate": 3.377823679468259e-05, "loss": 0.4092, "num_input_tokens_seen": 3894240, "step": 11575 }, { "epoch": 8.948995363214838, "grad_norm": 1.280097484588623, "learning_rate": 3.37624484902121e-05, "loss": 0.4101, "num_input_tokens_seen": 3895872, "step": 11580 }, { "epoch": 8.952859350850078, "grad_norm": 1.0872730016708374, "learning_rate": 3.3746656200527535e-05, "loss": 0.4941, "num_input_tokens_seen": 3897504, "step": 11585 }, { "epoch": 8.956723338485316, "grad_norm": 1.2222403287887573, "learning_rate": 3.3730859932811364e-05, "loss": 0.4361, "num_input_tokens_seen": 3899392, "step": 11590 }, { "epoch": 8.960587326120557, "grad_norm": 0.9480730295181274, "learning_rate": 3.371505969424781e-05, "loss": 0.4385, "num_input_tokens_seen": 3901216, "step": 11595 }, { "epoch": 8.964451313755795, "grad_norm": 0.787601888179779, "learning_rate": 3.369925549202291e-05, "loss": 0.352, "num_input_tokens_seen": 3902752, "step": 11600 }, { "epoch": 8.968315301391035, "grad_norm": 0.8238837718963623, "learning_rate": 3.368344733332451e-05, "loss": 0.4258, "num_input_tokens_seen": 3904480, "step": 11605 }, { "epoch": 8.972179289026275, "grad_norm": 0.607774555683136, "learning_rate": 3.366763522534227e-05, "loss": 0.4412, "num_input_tokens_seen": 3906432, "step": 11610 }, { "epoch": 8.976043276661514, "grad_norm": 0.758066713809967, "learning_rate": 3.365181917526761e-05, "loss": 0.5619, "num_input_tokens_seen": 3907968, "step": 11615 }, { "epoch": 8.979907264296754, "grad_norm": 0.765342652797699, "learning_rate": 3.363599919029378e-05, "loss": 0.393, "num_input_tokens_seen": 3909696, "step": 11620 }, { "epoch": 8.983771251931994, "grad_norm": 1.255387306213379, "learning_rate": 3.3620175277615806e-05, "loss": 0.6209, "num_input_tokens_seen": 3911616, "step": 11625 }, { "epoch": 8.987635239567233, "grad_norm": 1.1213077306747437, "learning_rate": 3.360434744443049e-05, "loss": 0.4203, "num_input_tokens_seen": 3913344, "step": 11630 }, { "epoch": 8.991499227202473, "grad_norm": 0.8453337550163269, "learning_rate": 3.358851569793642e-05, "loss": 0.519, "num_input_tokens_seen": 3915072, "step": 11635 }, { "epoch": 8.995363214837713, "grad_norm": 2.7547757625579834, "learning_rate": 3.357268004533398e-05, "loss": 0.5862, "num_input_tokens_seen": 3916608, "step": 11640 }, { "epoch": 8.999227202472952, "grad_norm": 0.5722439289093018, "learning_rate": 3.355684049382532e-05, "loss": 0.464, "num_input_tokens_seen": 3918304, "step": 11645 }, { "epoch": 9.0, "eval_loss": 0.47822773456573486, "eval_runtime": 6.3644, "eval_samples_per_second": 90.347, "eval_steps_per_second": 22.626, "num_input_tokens_seen": 3918416, "step": 11646 }, { "epoch": 9.003091190108192, "grad_norm": 0.717621922492981, "learning_rate": 3.354099705061435e-05, "loss": 0.4832, "num_input_tokens_seen": 3919696, "step": 11650 }, { "epoch": 9.006955177743432, "grad_norm": 0.7543171644210815, "learning_rate": 3.352514972290676e-05, "loss": 0.4551, "num_input_tokens_seen": 3921232, "step": 11655 }, { "epoch": 9.01081916537867, "grad_norm": 1.1137785911560059, "learning_rate": 3.3509298517910045e-05, "loss": 0.4532, "num_input_tokens_seen": 3922672, "step": 11660 }, { "epoch": 9.01468315301391, "grad_norm": 0.7533212304115295, "learning_rate": 3.3493443442833397e-05, "loss": 0.387, "num_input_tokens_seen": 3924432, "step": 11665 }, { "epoch": 9.018547140649149, "grad_norm": 0.8558134436607361, "learning_rate": 3.34775845048878e-05, "loss": 0.401, "num_input_tokens_seen": 3926192, "step": 11670 }, { "epoch": 9.02241112828439, "grad_norm": 1.7434027194976807, "learning_rate": 3.3461721711286e-05, "loss": 0.4694, "num_input_tokens_seen": 3927952, "step": 11675 }, { "epoch": 9.02627511591963, "grad_norm": 1.151074767112732, "learning_rate": 3.344585506924249e-05, "loss": 0.4598, "num_input_tokens_seen": 3929488, "step": 11680 }, { "epoch": 9.030139103554868, "grad_norm": 1.1493332386016846, "learning_rate": 3.342998458597352e-05, "loss": 0.4897, "num_input_tokens_seen": 3931056, "step": 11685 }, { "epoch": 9.034003091190108, "grad_norm": 1.1231937408447266, "learning_rate": 3.3414110268697075e-05, "loss": 0.5141, "num_input_tokens_seen": 3932688, "step": 11690 }, { "epoch": 9.037867078825348, "grad_norm": 1.2863143682479858, "learning_rate": 3.3398232124632884e-05, "loss": 0.487, "num_input_tokens_seen": 3934288, "step": 11695 }, { "epoch": 9.041731066460587, "grad_norm": 1.3349745273590088, "learning_rate": 3.3382350161002434e-05, "loss": 0.7064, "num_input_tokens_seen": 3935728, "step": 11700 }, { "epoch": 9.045595054095827, "grad_norm": 0.7659803032875061, "learning_rate": 3.336646438502893e-05, "loss": 0.365, "num_input_tokens_seen": 3937456, "step": 11705 }, { "epoch": 9.049459041731067, "grad_norm": 1.5564604997634888, "learning_rate": 3.3350574803937315e-05, "loss": 0.5433, "num_input_tokens_seen": 3938832, "step": 11710 }, { "epoch": 9.053323029366306, "grad_norm": 1.077906608581543, "learning_rate": 3.3334681424954274e-05, "loss": 0.6101, "num_input_tokens_seen": 3940560, "step": 11715 }, { "epoch": 9.057187017001546, "grad_norm": 1.0309473276138306, "learning_rate": 3.33187842553082e-05, "loss": 0.7048, "num_input_tokens_seen": 3942320, "step": 11720 }, { "epoch": 9.061051004636786, "grad_norm": 1.062100887298584, "learning_rate": 3.330288330222923e-05, "loss": 0.6214, "num_input_tokens_seen": 3943888, "step": 11725 }, { "epoch": 9.064914992272024, "grad_norm": 0.7844257950782776, "learning_rate": 3.3286978572949214e-05, "loss": 0.5393, "num_input_tokens_seen": 3945648, "step": 11730 }, { "epoch": 9.068778979907265, "grad_norm": 0.7166469097137451, "learning_rate": 3.327107007470171e-05, "loss": 0.3604, "num_input_tokens_seen": 3947088, "step": 11735 }, { "epoch": 9.072642967542503, "grad_norm": 0.9504762887954712, "learning_rate": 3.3255157814722003e-05, "loss": 0.506, "num_input_tokens_seen": 3948880, "step": 11740 }, { "epoch": 9.076506955177743, "grad_norm": 0.8650409579277039, "learning_rate": 3.3239241800247086e-05, "loss": 0.4625, "num_input_tokens_seen": 3950544, "step": 11745 }, { "epoch": 9.080370942812984, "grad_norm": 0.9763902425765991, "learning_rate": 3.3223322038515656e-05, "loss": 0.4194, "num_input_tokens_seen": 3952304, "step": 11750 }, { "epoch": 9.084234930448222, "grad_norm": 1.1360087394714355, "learning_rate": 3.320739853676812e-05, "loss": 0.4436, "num_input_tokens_seen": 3954032, "step": 11755 }, { "epoch": 9.088098918083462, "grad_norm": 0.587765634059906, "learning_rate": 3.319147130224656e-05, "loss": 0.4542, "num_input_tokens_seen": 3955728, "step": 11760 }, { "epoch": 9.091962905718702, "grad_norm": 0.829338788986206, "learning_rate": 3.317554034219481e-05, "loss": 0.4143, "num_input_tokens_seen": 3957360, "step": 11765 }, { "epoch": 9.09582689335394, "grad_norm": 1.1514536142349243, "learning_rate": 3.315960566385835e-05, "loss": 0.3474, "num_input_tokens_seen": 3958896, "step": 11770 }, { "epoch": 9.099690880989181, "grad_norm": 1.1731988191604614, "learning_rate": 3.314366727448436e-05, "loss": 0.527, "num_input_tokens_seen": 3960592, "step": 11775 }, { "epoch": 9.103554868624421, "grad_norm": 0.7336270809173584, "learning_rate": 3.312772518132173e-05, "loss": 0.4453, "num_input_tokens_seen": 3962256, "step": 11780 }, { "epoch": 9.10741885625966, "grad_norm": 0.5509374141693115, "learning_rate": 3.3111779391621014e-05, "loss": 0.3415, "num_input_tokens_seen": 3963856, "step": 11785 }, { "epoch": 9.1112828438949, "grad_norm": 0.7136176228523254, "learning_rate": 3.3095829912634445e-05, "loss": 0.4151, "num_input_tokens_seen": 3965872, "step": 11790 }, { "epoch": 9.115146831530138, "grad_norm": 0.9761099815368652, "learning_rate": 3.307987675161595e-05, "loss": 0.5287, "num_input_tokens_seen": 3967728, "step": 11795 }, { "epoch": 9.119010819165378, "grad_norm": 1.238808274269104, "learning_rate": 3.3063919915821115e-05, "loss": 0.4265, "num_input_tokens_seen": 3969328, "step": 11800 }, { "epoch": 9.122874806800619, "grad_norm": 0.6131095290184021, "learning_rate": 3.304795941250722e-05, "loss": 0.4412, "num_input_tokens_seen": 3971152, "step": 11805 }, { "epoch": 9.126738794435857, "grad_norm": 0.7129392623901367, "learning_rate": 3.3031995248933176e-05, "loss": 0.4106, "num_input_tokens_seen": 3972912, "step": 11810 }, { "epoch": 9.130602782071097, "grad_norm": 0.8727149367332458, "learning_rate": 3.30160274323596e-05, "loss": 0.5286, "num_input_tokens_seen": 3974672, "step": 11815 }, { "epoch": 9.134466769706338, "grad_norm": 0.9824816584587097, "learning_rate": 3.3000055970048734e-05, "loss": 0.4061, "num_input_tokens_seen": 3976336, "step": 11820 }, { "epoch": 9.138330757341576, "grad_norm": 0.8287321329116821, "learning_rate": 3.298408086926451e-05, "loss": 0.4117, "num_input_tokens_seen": 3977968, "step": 11825 }, { "epoch": 9.142194744976816, "grad_norm": 1.050695538520813, "learning_rate": 3.296810213727249e-05, "loss": 0.482, "num_input_tokens_seen": 3979696, "step": 11830 }, { "epoch": 9.146058732612056, "grad_norm": 0.9327228665351868, "learning_rate": 3.2952119781339895e-05, "loss": 0.4059, "num_input_tokens_seen": 3981328, "step": 11835 }, { "epoch": 9.149922720247295, "grad_norm": 1.0084916353225708, "learning_rate": 3.29361338087356e-05, "loss": 0.3504, "num_input_tokens_seen": 3983152, "step": 11840 }, { "epoch": 9.153786707882535, "grad_norm": 0.7097992300987244, "learning_rate": 3.2920144226730124e-05, "loss": 0.4329, "num_input_tokens_seen": 3984656, "step": 11845 }, { "epoch": 9.157650695517773, "grad_norm": 1.0880563259124756, "learning_rate": 3.290415104259563e-05, "loss": 0.4467, "num_input_tokens_seen": 3986192, "step": 11850 }, { "epoch": 9.161514683153014, "grad_norm": 1.3591710329055786, "learning_rate": 3.288815426360589e-05, "loss": 0.5224, "num_input_tokens_seen": 3987888, "step": 11855 }, { "epoch": 9.165378670788254, "grad_norm": 0.9889196157455444, "learning_rate": 3.287215389703636e-05, "loss": 0.6437, "num_input_tokens_seen": 3989456, "step": 11860 }, { "epoch": 9.169242658423492, "grad_norm": 0.8620941042900085, "learning_rate": 3.285614995016409e-05, "loss": 0.4758, "num_input_tokens_seen": 3991312, "step": 11865 }, { "epoch": 9.173106646058732, "grad_norm": 0.7256734371185303, "learning_rate": 3.284014243026778e-05, "loss": 0.3322, "num_input_tokens_seen": 3992912, "step": 11870 }, { "epoch": 9.176970633693973, "grad_norm": 1.3183952569961548, "learning_rate": 3.282413134462773e-05, "loss": 0.9327, "num_input_tokens_seen": 3994928, "step": 11875 }, { "epoch": 9.180834621329211, "grad_norm": 1.7092275619506836, "learning_rate": 3.2808116700525886e-05, "loss": 0.5541, "num_input_tokens_seen": 3996720, "step": 11880 }, { "epoch": 9.184698608964451, "grad_norm": 0.972530722618103, "learning_rate": 3.279209850524582e-05, "loss": 0.3548, "num_input_tokens_seen": 3998416, "step": 11885 }, { "epoch": 9.188562596599692, "grad_norm": 0.8060950040817261, "learning_rate": 3.2776076766072685e-05, "loss": 0.4182, "num_input_tokens_seen": 3999856, "step": 11890 }, { "epoch": 9.19242658423493, "grad_norm": 0.8933190703392029, "learning_rate": 3.2760051490293255e-05, "loss": 0.421, "num_input_tokens_seen": 4001520, "step": 11895 }, { "epoch": 9.19629057187017, "grad_norm": 0.7009643912315369, "learning_rate": 3.274402268519594e-05, "loss": 0.3828, "num_input_tokens_seen": 4003024, "step": 11900 }, { "epoch": 9.20015455950541, "grad_norm": 0.8043956756591797, "learning_rate": 3.272799035807074e-05, "loss": 0.4886, "num_input_tokens_seen": 4004720, "step": 11905 }, { "epoch": 9.204018547140649, "grad_norm": 0.8471553325653076, "learning_rate": 3.2711954516209236e-05, "loss": 0.3934, "num_input_tokens_seen": 4006352, "step": 11910 }, { "epoch": 9.207882534775889, "grad_norm": 0.8565335273742676, "learning_rate": 3.269591516690463e-05, "loss": 0.3763, "num_input_tokens_seen": 4008016, "step": 11915 }, { "epoch": 9.211746522411127, "grad_norm": 0.9123969078063965, "learning_rate": 3.267987231745172e-05, "loss": 0.4517, "num_input_tokens_seen": 4009552, "step": 11920 }, { "epoch": 9.215610510046368, "grad_norm": 0.7259919047355652, "learning_rate": 3.2663825975146896e-05, "loss": 0.439, "num_input_tokens_seen": 4011344, "step": 11925 }, { "epoch": 9.219474497681608, "grad_norm": 1.3453497886657715, "learning_rate": 3.264777614728811e-05, "loss": 0.4644, "num_input_tokens_seen": 4013040, "step": 11930 }, { "epoch": 9.223338485316846, "grad_norm": 0.612535834312439, "learning_rate": 3.263172284117493e-05, "loss": 0.3949, "num_input_tokens_seen": 4014576, "step": 11935 }, { "epoch": 9.227202472952087, "grad_norm": 0.685818612575531, "learning_rate": 3.261566606410851e-05, "loss": 0.3666, "num_input_tokens_seen": 4016144, "step": 11940 }, { "epoch": 9.231066460587327, "grad_norm": 1.412715196609497, "learning_rate": 3.259960582339155e-05, "loss": 0.4104, "num_input_tokens_seen": 4017776, "step": 11945 }, { "epoch": 9.234930448222565, "grad_norm": 1.362682819366455, "learning_rate": 3.258354212632834e-05, "loss": 0.4808, "num_input_tokens_seen": 4019440, "step": 11950 }, { "epoch": 9.238794435857805, "grad_norm": 0.9877966642379761, "learning_rate": 3.256747498022476e-05, "loss": 0.4292, "num_input_tokens_seen": 4020976, "step": 11955 }, { "epoch": 9.242658423493046, "grad_norm": 1.2878137826919556, "learning_rate": 3.255140439238825e-05, "loss": 0.4327, "num_input_tokens_seen": 4022704, "step": 11960 }, { "epoch": 9.246522411128284, "grad_norm": 0.94929438829422, "learning_rate": 3.2535330370127786e-05, "loss": 0.4128, "num_input_tokens_seen": 4024528, "step": 11965 }, { "epoch": 9.250386398763524, "grad_norm": 0.6502611637115479, "learning_rate": 3.251925292075395e-05, "loss": 0.4403, "num_input_tokens_seen": 4026128, "step": 11970 }, { "epoch": 9.254250386398763, "grad_norm": 0.6419329643249512, "learning_rate": 3.2503172051578846e-05, "loss": 0.5398, "num_input_tokens_seen": 4027888, "step": 11975 }, { "epoch": 9.258114374034003, "grad_norm": 0.7147688865661621, "learning_rate": 3.248708776991617e-05, "loss": 0.4869, "num_input_tokens_seen": 4029392, "step": 11980 }, { "epoch": 9.261978361669243, "grad_norm": 0.9843432307243347, "learning_rate": 3.2471000083081126e-05, "loss": 0.4532, "num_input_tokens_seen": 4031152, "step": 11985 }, { "epoch": 9.265842349304481, "grad_norm": 0.6750032305717468, "learning_rate": 3.2454908998390506e-05, "loss": 0.3498, "num_input_tokens_seen": 4032848, "step": 11990 }, { "epoch": 9.269706336939722, "grad_norm": 0.8644545078277588, "learning_rate": 3.243881452316263e-05, "loss": 0.3821, "num_input_tokens_seen": 4034352, "step": 11995 }, { "epoch": 9.273570324574962, "grad_norm": 0.6990744471549988, "learning_rate": 3.242271666471736e-05, "loss": 0.4016, "num_input_tokens_seen": 4035888, "step": 12000 }, { "epoch": 9.2774343122102, "grad_norm": 1.2089684009552002, "learning_rate": 3.2406615430376095e-05, "loss": 0.8195, "num_input_tokens_seen": 4037456, "step": 12005 }, { "epoch": 9.28129829984544, "grad_norm": 0.7730933427810669, "learning_rate": 3.2390510827461785e-05, "loss": 0.4534, "num_input_tokens_seen": 4039184, "step": 12010 }, { "epoch": 9.28516228748068, "grad_norm": 1.3978934288024902, "learning_rate": 3.237440286329888e-05, "loss": 0.4745, "num_input_tokens_seen": 4040656, "step": 12015 }, { "epoch": 9.28902627511592, "grad_norm": 0.7044410109519958, "learning_rate": 3.235829154521339e-05, "loss": 0.6269, "num_input_tokens_seen": 4042320, "step": 12020 }, { "epoch": 9.29289026275116, "grad_norm": 0.694850504398346, "learning_rate": 3.234217688053284e-05, "loss": 0.4895, "num_input_tokens_seen": 4043952, "step": 12025 }, { "epoch": 9.2967542503864, "grad_norm": 1.160627841949463, "learning_rate": 3.232605887658628e-05, "loss": 0.545, "num_input_tokens_seen": 4045584, "step": 12030 }, { "epoch": 9.300618238021638, "grad_norm": 1.3286100625991821, "learning_rate": 3.2309937540704256e-05, "loss": 0.6756, "num_input_tokens_seen": 4047024, "step": 12035 }, { "epoch": 9.304482225656878, "grad_norm": 1.0660452842712402, "learning_rate": 3.229381288021887e-05, "loss": 0.3674, "num_input_tokens_seen": 4048784, "step": 12040 }, { "epoch": 9.308346213292117, "grad_norm": 0.7905836701393127, "learning_rate": 3.2277684902463705e-05, "loss": 0.3987, "num_input_tokens_seen": 4050448, "step": 12045 }, { "epoch": 9.312210200927357, "grad_norm": 1.7573531866073608, "learning_rate": 3.226155361477386e-05, "loss": 0.6245, "num_input_tokens_seen": 4052144, "step": 12050 }, { "epoch": 9.316074188562597, "grad_norm": 1.445534348487854, "learning_rate": 3.224541902448594e-05, "loss": 0.4915, "num_input_tokens_seen": 4053808, "step": 12055 }, { "epoch": 9.319938176197835, "grad_norm": 0.9456449151039124, "learning_rate": 3.2229281138938063e-05, "loss": 0.3737, "num_input_tokens_seen": 4055472, "step": 12060 }, { "epoch": 9.323802163833076, "grad_norm": 1.3553122282028198, "learning_rate": 3.221313996546983e-05, "loss": 0.5061, "num_input_tokens_seen": 4057168, "step": 12065 }, { "epoch": 9.327666151468316, "grad_norm": 1.5595247745513916, "learning_rate": 3.219699551142234e-05, "loss": 0.4618, "num_input_tokens_seen": 4059056, "step": 12070 }, { "epoch": 9.331530139103554, "grad_norm": 1.187449336051941, "learning_rate": 3.2180847784138193e-05, "loss": 0.3894, "num_input_tokens_seen": 4060688, "step": 12075 }, { "epoch": 9.335394126738795, "grad_norm": 0.7939422130584717, "learning_rate": 3.216469679096146e-05, "loss": 0.551, "num_input_tokens_seen": 4062544, "step": 12080 }, { "epoch": 9.339258114374035, "grad_norm": 1.0271817445755005, "learning_rate": 3.214854253923772e-05, "loss": 0.6064, "num_input_tokens_seen": 4064432, "step": 12085 }, { "epoch": 9.343122102009273, "grad_norm": 0.6604605317115784, "learning_rate": 3.213238503631404e-05, "loss": 0.5917, "num_input_tokens_seen": 4066096, "step": 12090 }, { "epoch": 9.346986089644513, "grad_norm": 1.0536738634109497, "learning_rate": 3.2116224289538916e-05, "loss": 0.3792, "num_input_tokens_seen": 4067856, "step": 12095 }, { "epoch": 9.350850077279752, "grad_norm": 1.2410589456558228, "learning_rate": 3.210006030626237e-05, "loss": 0.3681, "num_input_tokens_seen": 4069488, "step": 12100 }, { "epoch": 9.354714064914992, "grad_norm": 0.7596853971481323, "learning_rate": 3.2083893093835876e-05, "loss": 0.5059, "num_input_tokens_seen": 4071216, "step": 12105 }, { "epoch": 9.358578052550232, "grad_norm": 0.6137245893478394, "learning_rate": 3.2067722659612384e-05, "loss": 0.4355, "num_input_tokens_seen": 4072784, "step": 12110 }, { "epoch": 9.36244204018547, "grad_norm": 0.6445506811141968, "learning_rate": 3.205154901094629e-05, "loss": 0.6151, "num_input_tokens_seen": 4074416, "step": 12115 }, { "epoch": 9.36630602782071, "grad_norm": 0.7319191694259644, "learning_rate": 3.203537215519349e-05, "loss": 0.4447, "num_input_tokens_seen": 4075984, "step": 12120 }, { "epoch": 9.370170015455951, "grad_norm": 0.896584689617157, "learning_rate": 3.201919209971128e-05, "loss": 0.3806, "num_input_tokens_seen": 4077776, "step": 12125 }, { "epoch": 9.37403400309119, "grad_norm": 0.9424496293067932, "learning_rate": 3.200300885185849e-05, "loss": 0.4732, "num_input_tokens_seen": 4079536, "step": 12130 }, { "epoch": 9.37789799072643, "grad_norm": 1.0655720233917236, "learning_rate": 3.1986822418995314e-05, "loss": 0.4208, "num_input_tokens_seen": 4081392, "step": 12135 }, { "epoch": 9.38176197836167, "grad_norm": 0.7213584184646606, "learning_rate": 3.197063280848347e-05, "loss": 0.4195, "num_input_tokens_seen": 4083120, "step": 12140 }, { "epoch": 9.385625965996908, "grad_norm": 0.7395865321159363, "learning_rate": 3.195444002768608e-05, "loss": 0.3782, "num_input_tokens_seen": 4084816, "step": 12145 }, { "epoch": 9.389489953632149, "grad_norm": 0.7688573002815247, "learning_rate": 3.193824408396772e-05, "loss": 0.4787, "num_input_tokens_seen": 4086608, "step": 12150 }, { "epoch": 9.393353941267389, "grad_norm": 1.0682986974716187, "learning_rate": 3.1922044984694386e-05, "loss": 0.478, "num_input_tokens_seen": 4088144, "step": 12155 }, { "epoch": 9.397217928902627, "grad_norm": 1.009850263595581, "learning_rate": 3.190584273723355e-05, "loss": 0.5239, "num_input_tokens_seen": 4089904, "step": 12160 }, { "epoch": 9.401081916537867, "grad_norm": 1.0013542175292969, "learning_rate": 3.1889637348954076e-05, "loss": 0.3807, "num_input_tokens_seen": 4091472, "step": 12165 }, { "epoch": 9.404945904173106, "grad_norm": 1.9001843929290771, "learning_rate": 3.187342882722628e-05, "loss": 0.4807, "num_input_tokens_seen": 4093040, "step": 12170 }, { "epoch": 9.408809891808346, "grad_norm": 0.9478427171707153, "learning_rate": 3.185721717942188e-05, "loss": 0.5175, "num_input_tokens_seen": 4094768, "step": 12175 }, { "epoch": 9.412673879443586, "grad_norm": 1.0304878950119019, "learning_rate": 3.184100241291405e-05, "loss": 0.4263, "num_input_tokens_seen": 4096560, "step": 12180 }, { "epoch": 9.416537867078825, "grad_norm": 0.8260247707366943, "learning_rate": 3.1824784535077344e-05, "loss": 0.3794, "num_input_tokens_seen": 4098352, "step": 12185 }, { "epoch": 9.420401854714065, "grad_norm": 0.7937590479850769, "learning_rate": 3.180856355328776e-05, "loss": 0.318, "num_input_tokens_seen": 4099984, "step": 12190 }, { "epoch": 9.424265842349305, "grad_norm": 1.2519742250442505, "learning_rate": 3.1792339474922704e-05, "loss": 0.3669, "num_input_tokens_seen": 4101648, "step": 12195 }, { "epoch": 9.428129829984544, "grad_norm": 1.0137184858322144, "learning_rate": 3.177611230736098e-05, "loss": 0.4954, "num_input_tokens_seen": 4103280, "step": 12200 }, { "epoch": 9.431993817619784, "grad_norm": 1.1422877311706543, "learning_rate": 3.175988205798279e-05, "loss": 0.4569, "num_input_tokens_seen": 4105104, "step": 12205 }, { "epoch": 9.435857805255024, "grad_norm": 1.1101878881454468, "learning_rate": 3.174364873416976e-05, "loss": 0.4559, "num_input_tokens_seen": 4106928, "step": 12210 }, { "epoch": 9.439721792890262, "grad_norm": 2.443667411804199, "learning_rate": 3.1727412343304896e-05, "loss": 0.4734, "num_input_tokens_seen": 4108496, "step": 12215 }, { "epoch": 9.443585780525503, "grad_norm": 0.8309926390647888, "learning_rate": 3.171117289277262e-05, "loss": 0.436, "num_input_tokens_seen": 4110256, "step": 12220 }, { "epoch": 9.447449768160741, "grad_norm": 0.5280288457870483, "learning_rate": 3.169493038995871e-05, "loss": 0.3639, "num_input_tokens_seen": 4112048, "step": 12225 }, { "epoch": 9.451313755795981, "grad_norm": 1.3202719688415527, "learning_rate": 3.167868484225037e-05, "loss": 0.5264, "num_input_tokens_seen": 4113616, "step": 12230 }, { "epoch": 9.455177743431221, "grad_norm": 1.176466941833496, "learning_rate": 3.166243625703616e-05, "loss": 0.4279, "num_input_tokens_seen": 4115216, "step": 12235 }, { "epoch": 9.45904173106646, "grad_norm": 1.13979971408844, "learning_rate": 3.1646184641706054e-05, "loss": 0.4001, "num_input_tokens_seen": 4117008, "step": 12240 }, { "epoch": 9.4629057187017, "grad_norm": 1.0716356039047241, "learning_rate": 3.162993000365135e-05, "loss": 0.493, "num_input_tokens_seen": 4118832, "step": 12245 }, { "epoch": 9.46676970633694, "grad_norm": 0.9093238711357117, "learning_rate": 3.1613672350264795e-05, "loss": 0.4589, "num_input_tokens_seen": 4120688, "step": 12250 }, { "epoch": 9.470633693972179, "grad_norm": 0.7926966547966003, "learning_rate": 3.1597411688940435e-05, "loss": 0.4693, "num_input_tokens_seen": 4122352, "step": 12255 }, { "epoch": 9.474497681607419, "grad_norm": 0.7327173948287964, "learning_rate": 3.158114802707373e-05, "loss": 0.5451, "num_input_tokens_seen": 4123856, "step": 12260 }, { "epoch": 9.478361669242659, "grad_norm": 0.8204582333564758, "learning_rate": 3.1564881372061493e-05, "loss": 0.3643, "num_input_tokens_seen": 4125360, "step": 12265 }, { "epoch": 9.482225656877898, "grad_norm": 0.9148836731910706, "learning_rate": 3.1548611731301895e-05, "loss": 0.5458, "num_input_tokens_seen": 4126960, "step": 12270 }, { "epoch": 9.486089644513138, "grad_norm": 0.7316181063652039, "learning_rate": 3.153233911219446e-05, "loss": 0.393, "num_input_tokens_seen": 4128816, "step": 12275 }, { "epoch": 9.489953632148378, "grad_norm": 0.8742881417274475, "learning_rate": 3.151606352214007e-05, "loss": 0.3919, "num_input_tokens_seen": 4130480, "step": 12280 }, { "epoch": 9.493817619783616, "grad_norm": 1.1111514568328857, "learning_rate": 3.149978496854098e-05, "loss": 0.469, "num_input_tokens_seen": 4132080, "step": 12285 }, { "epoch": 9.497681607418857, "grad_norm": 0.8807827234268188, "learning_rate": 3.1483503458800755e-05, "loss": 0.4303, "num_input_tokens_seen": 4134096, "step": 12290 }, { "epoch": 9.501545595054095, "grad_norm": 0.9200833439826965, "learning_rate": 3.146721900032431e-05, "loss": 0.5002, "num_input_tokens_seen": 4135760, "step": 12295 }, { "epoch": 9.505409582689335, "grad_norm": 0.6029820442199707, "learning_rate": 3.1450931600517966e-05, "loss": 0.5257, "num_input_tokens_seen": 4137392, "step": 12300 }, { "epoch": 9.509273570324575, "grad_norm": 0.6735271215438843, "learning_rate": 3.143464126678928e-05, "loss": 0.3893, "num_input_tokens_seen": 4138768, "step": 12305 }, { "epoch": 9.513137557959814, "grad_norm": 1.5906341075897217, "learning_rate": 3.141834800654721e-05, "loss": 0.6937, "num_input_tokens_seen": 4140464, "step": 12310 }, { "epoch": 9.517001545595054, "grad_norm": 1.5476895570755005, "learning_rate": 3.140205182720203e-05, "loss": 0.4785, "num_input_tokens_seen": 4142192, "step": 12315 }, { "epoch": 9.520865533230294, "grad_norm": 1.821861743927002, "learning_rate": 3.1385752736165336e-05, "loss": 0.4916, "num_input_tokens_seen": 4144048, "step": 12320 }, { "epoch": 9.524729520865533, "grad_norm": 0.8613965511322021, "learning_rate": 3.136945074085006e-05, "loss": 0.3683, "num_input_tokens_seen": 4145808, "step": 12325 }, { "epoch": 9.528593508500773, "grad_norm": 0.9784826040267944, "learning_rate": 3.135314584867044e-05, "loss": 0.3682, "num_input_tokens_seen": 4147504, "step": 12330 }, { "epoch": 9.532457496136013, "grad_norm": 0.7817595601081848, "learning_rate": 3.133683806704203e-05, "loss": 0.3554, "num_input_tokens_seen": 4149264, "step": 12335 }, { "epoch": 9.536321483771252, "grad_norm": 1.5103411674499512, "learning_rate": 3.132052740338174e-05, "loss": 0.5214, "num_input_tokens_seen": 4151120, "step": 12340 }, { "epoch": 9.540185471406492, "grad_norm": 0.7191941738128662, "learning_rate": 3.1304213865107715e-05, "loss": 0.4399, "num_input_tokens_seen": 4153008, "step": 12345 }, { "epoch": 9.54404945904173, "grad_norm": 0.6591215133666992, "learning_rate": 3.128789745963948e-05, "loss": 0.3861, "num_input_tokens_seen": 4154672, "step": 12350 }, { "epoch": 9.54791344667697, "grad_norm": 1.0322983264923096, "learning_rate": 3.127157819439782e-05, "loss": 0.4288, "num_input_tokens_seen": 4156240, "step": 12355 }, { "epoch": 9.55177743431221, "grad_norm": 0.803098201751709, "learning_rate": 3.125525607680484e-05, "loss": 0.4259, "num_input_tokens_seen": 4157968, "step": 12360 }, { "epoch": 9.555641421947449, "grad_norm": 1.1036481857299805, "learning_rate": 3.123893111428393e-05, "loss": 0.3719, "num_input_tokens_seen": 4160048, "step": 12365 }, { "epoch": 9.55950540958269, "grad_norm": 1.1647474765777588, "learning_rate": 3.122260331425979e-05, "loss": 0.3794, "num_input_tokens_seen": 4161616, "step": 12370 }, { "epoch": 9.56336939721793, "grad_norm": 0.8265092968940735, "learning_rate": 3.12062726841584e-05, "loss": 0.4377, "num_input_tokens_seen": 4163376, "step": 12375 }, { "epoch": 9.567233384853168, "grad_norm": 1.4266440868377686, "learning_rate": 3.118993923140702e-05, "loss": 0.6636, "num_input_tokens_seen": 4165424, "step": 12380 }, { "epoch": 9.571097372488408, "grad_norm": 1.2176921367645264, "learning_rate": 3.117360296343421e-05, "loss": 0.4597, "num_input_tokens_seen": 4167184, "step": 12385 }, { "epoch": 9.574961360123648, "grad_norm": 0.7719448804855347, "learning_rate": 3.11572638876698e-05, "loss": 0.7097, "num_input_tokens_seen": 4168880, "step": 12390 }, { "epoch": 9.578825347758887, "grad_norm": 1.4786931276321411, "learning_rate": 3.1140922011544895e-05, "loss": 0.597, "num_input_tokens_seen": 4170480, "step": 12395 }, { "epoch": 9.582689335394127, "grad_norm": 0.6626794934272766, "learning_rate": 3.1124577342491884e-05, "loss": 0.3824, "num_input_tokens_seen": 4172240, "step": 12400 }, { "epoch": 9.586553323029367, "grad_norm": 0.854629635810852, "learning_rate": 3.110822988794442e-05, "loss": 0.4702, "num_input_tokens_seen": 4173776, "step": 12405 }, { "epoch": 9.590417310664606, "grad_norm": 2.2504968643188477, "learning_rate": 3.109187965533743e-05, "loss": 0.427, "num_input_tokens_seen": 4175568, "step": 12410 }, { "epoch": 9.594281298299846, "grad_norm": 0.8522206544876099, "learning_rate": 3.107552665210708e-05, "loss": 0.348, "num_input_tokens_seen": 4177360, "step": 12415 }, { "epoch": 9.598145285935084, "grad_norm": 1.0692455768585205, "learning_rate": 3.1059170885690827e-05, "loss": 0.4823, "num_input_tokens_seen": 4179088, "step": 12420 }, { "epoch": 9.602009273570324, "grad_norm": 0.8503953814506531, "learning_rate": 3.104281236352737e-05, "loss": 0.4959, "num_input_tokens_seen": 4180752, "step": 12425 }, { "epoch": 9.605873261205565, "grad_norm": 1.1137105226516724, "learning_rate": 3.102645109305666e-05, "loss": 0.5095, "num_input_tokens_seen": 4182384, "step": 12430 }, { "epoch": 9.609737248840803, "grad_norm": 1.4269020557403564, "learning_rate": 3.10100870817199e-05, "loss": 0.6277, "num_input_tokens_seen": 4184208, "step": 12435 }, { "epoch": 9.613601236476043, "grad_norm": 0.8665485978126526, "learning_rate": 3.099372033695954e-05, "loss": 0.6907, "num_input_tokens_seen": 4185744, "step": 12440 }, { "epoch": 9.617465224111283, "grad_norm": 0.8805075883865356, "learning_rate": 3.097735086621928e-05, "loss": 0.3471, "num_input_tokens_seen": 4187472, "step": 12445 }, { "epoch": 9.621329211746522, "grad_norm": 0.7997760772705078, "learning_rate": 3.096097867694405e-05, "loss": 0.4282, "num_input_tokens_seen": 4189200, "step": 12450 }, { "epoch": 9.625193199381762, "grad_norm": 1.3920832872390747, "learning_rate": 3.0944603776580016e-05, "loss": 0.5591, "num_input_tokens_seen": 4191344, "step": 12455 }, { "epoch": 9.629057187017002, "grad_norm": 0.8222126364707947, "learning_rate": 3.0928226172574585e-05, "loss": 0.4302, "num_input_tokens_seen": 4192784, "step": 12460 }, { "epoch": 9.63292117465224, "grad_norm": 0.9230024814605713, "learning_rate": 3.091184587237639e-05, "loss": 0.5806, "num_input_tokens_seen": 4194768, "step": 12465 }, { "epoch": 9.636785162287481, "grad_norm": 1.1807656288146973, "learning_rate": 3.0895462883435285e-05, "loss": 0.4474, "num_input_tokens_seen": 4196336, "step": 12470 }, { "epoch": 9.64064914992272, "grad_norm": 1.405816674232483, "learning_rate": 3.087907721320236e-05, "loss": 0.4237, "num_input_tokens_seen": 4198096, "step": 12475 }, { "epoch": 9.64451313755796, "grad_norm": 0.9792608618736267, "learning_rate": 3.0862688869129895e-05, "loss": 0.3856, "num_input_tokens_seen": 4199664, "step": 12480 }, { "epoch": 9.6483771251932, "grad_norm": 0.6221159100532532, "learning_rate": 3.084629785867143e-05, "loss": 0.4414, "num_input_tokens_seen": 4201328, "step": 12485 }, { "epoch": 9.652241112828438, "grad_norm": 0.8375487923622131, "learning_rate": 3.0829904189281694e-05, "loss": 0.3305, "num_input_tokens_seen": 4202832, "step": 12490 }, { "epoch": 9.656105100463678, "grad_norm": 0.7320209741592407, "learning_rate": 3.081350786841661e-05, "loss": 0.7506, "num_input_tokens_seen": 4204624, "step": 12495 }, { "epoch": 9.659969088098919, "grad_norm": 1.0626972913742065, "learning_rate": 3.079710890353334e-05, "loss": 0.346, "num_input_tokens_seen": 4206448, "step": 12500 }, { "epoch": 9.663833075734157, "grad_norm": 0.9586750268936157, "learning_rate": 3.078070730209021e-05, "loss": 0.4068, "num_input_tokens_seen": 4208400, "step": 12505 }, { "epoch": 9.667697063369397, "grad_norm": 0.6733459830284119, "learning_rate": 3.0764303071546794e-05, "loss": 0.5262, "num_input_tokens_seen": 4210192, "step": 12510 }, { "epoch": 9.671561051004637, "grad_norm": 0.9805769324302673, "learning_rate": 3.074789621936381e-05, "loss": 0.429, "num_input_tokens_seen": 4211760, "step": 12515 }, { "epoch": 9.675425038639876, "grad_norm": 0.9802903532981873, "learning_rate": 3.07314867530032e-05, "loss": 0.4607, "num_input_tokens_seen": 4213648, "step": 12520 }, { "epoch": 9.679289026275116, "grad_norm": 0.664831817150116, "learning_rate": 3.07150746799281e-05, "loss": 0.4666, "num_input_tokens_seen": 4215184, "step": 12525 }, { "epoch": 9.683153013910356, "grad_norm": 2.259472131729126, "learning_rate": 3.069866000760281e-05, "loss": 0.9537, "num_input_tokens_seen": 4216880, "step": 12530 }, { "epoch": 9.687017001545595, "grad_norm": 1.2996784448623657, "learning_rate": 3.0682242743492816e-05, "loss": 0.5199, "num_input_tokens_seen": 4218576, "step": 12535 }, { "epoch": 9.690880989180835, "grad_norm": 0.770037055015564, "learning_rate": 3.066582289506479e-05, "loss": 0.4472, "num_input_tokens_seen": 4220336, "step": 12540 }, { "epoch": 9.694744976816073, "grad_norm": 1.2389986515045166, "learning_rate": 3.064940046978658e-05, "loss": 0.366, "num_input_tokens_seen": 4222000, "step": 12545 }, { "epoch": 9.698608964451314, "grad_norm": 0.8074690699577332, "learning_rate": 3.0632975475127216e-05, "loss": 0.3976, "num_input_tokens_seen": 4223824, "step": 12550 }, { "epoch": 9.702472952086554, "grad_norm": 0.9193640947341919, "learning_rate": 3.061654791855686e-05, "loss": 0.7647, "num_input_tokens_seen": 4225552, "step": 12555 }, { "epoch": 9.706336939721792, "grad_norm": 0.9600542187690735, "learning_rate": 3.060011780754687e-05, "loss": 0.5447, "num_input_tokens_seen": 4227024, "step": 12560 }, { "epoch": 9.710200927357032, "grad_norm": 0.8977251648902893, "learning_rate": 3.058368514956977e-05, "loss": 0.3307, "num_input_tokens_seen": 4228464, "step": 12565 }, { "epoch": 9.714064914992273, "grad_norm": 1.041532278060913, "learning_rate": 3.056724995209923e-05, "loss": 0.4674, "num_input_tokens_seen": 4230544, "step": 12570 }, { "epoch": 9.717928902627511, "grad_norm": 0.7460445761680603, "learning_rate": 3.055081222261006e-05, "loss": 0.3873, "num_input_tokens_seen": 4232336, "step": 12575 }, { "epoch": 9.721792890262751, "grad_norm": 0.8312209844589233, "learning_rate": 3.0534371968578256e-05, "loss": 0.399, "num_input_tokens_seen": 4234096, "step": 12580 }, { "epoch": 9.725656877897991, "grad_norm": 1.0112149715423584, "learning_rate": 3.0517929197480935e-05, "loss": 0.3695, "num_input_tokens_seen": 4235824, "step": 12585 }, { "epoch": 9.72952086553323, "grad_norm": 1.2831199169158936, "learning_rate": 3.050148391679637e-05, "loss": 0.5556, "num_input_tokens_seen": 4237424, "step": 12590 }, { "epoch": 9.73338485316847, "grad_norm": 0.9299487471580505, "learning_rate": 3.048503613400397e-05, "loss": 0.5089, "num_input_tokens_seen": 4239184, "step": 12595 }, { "epoch": 9.737248840803709, "grad_norm": 0.9654529094696045, "learning_rate": 3.0468585856584288e-05, "loss": 0.4529, "num_input_tokens_seen": 4240816, "step": 12600 }, { "epoch": 9.741112828438949, "grad_norm": 1.087465763092041, "learning_rate": 3.045213309201901e-05, "loss": 0.5729, "num_input_tokens_seen": 4242704, "step": 12605 }, { "epoch": 9.744976816074189, "grad_norm": 0.7184786200523376, "learning_rate": 3.043567784779095e-05, "loss": 0.4874, "num_input_tokens_seen": 4244176, "step": 12610 }, { "epoch": 9.748840803709427, "grad_norm": 0.5954999327659607, "learning_rate": 3.0419220131384053e-05, "loss": 0.3545, "num_input_tokens_seen": 4245776, "step": 12615 }, { "epoch": 9.752704791344668, "grad_norm": 0.8848159909248352, "learning_rate": 3.040275995028338e-05, "loss": 0.4894, "num_input_tokens_seen": 4247440, "step": 12620 }, { "epoch": 9.756568778979908, "grad_norm": 1.027570366859436, "learning_rate": 3.0386297311975126e-05, "loss": 0.4385, "num_input_tokens_seen": 4249072, "step": 12625 }, { "epoch": 9.760432766615146, "grad_norm": 0.7290099859237671, "learning_rate": 3.0369832223946603e-05, "loss": 0.3556, "num_input_tokens_seen": 4250672, "step": 12630 }, { "epoch": 9.764296754250386, "grad_norm": 0.7383884787559509, "learning_rate": 3.0353364693686233e-05, "loss": 0.5632, "num_input_tokens_seen": 4252144, "step": 12635 }, { "epoch": 9.768160741885627, "grad_norm": 0.7876123189926147, "learning_rate": 3.033689472868352e-05, "loss": 0.4789, "num_input_tokens_seen": 4253840, "step": 12640 }, { "epoch": 9.772024729520865, "grad_norm": 1.2013921737670898, "learning_rate": 3.032042233642914e-05, "loss": 0.4279, "num_input_tokens_seen": 4255600, "step": 12645 }, { "epoch": 9.775888717156105, "grad_norm": 0.9092114567756653, "learning_rate": 3.030394752441481e-05, "loss": 0.5151, "num_input_tokens_seen": 4257200, "step": 12650 }, { "epoch": 9.779752704791346, "grad_norm": 0.675221860408783, "learning_rate": 3.0287470300133384e-05, "loss": 0.3996, "num_input_tokens_seen": 4258832, "step": 12655 }, { "epoch": 9.783616692426584, "grad_norm": 1.1078851222991943, "learning_rate": 3.0270990671078798e-05, "loss": 0.4955, "num_input_tokens_seen": 4260336, "step": 12660 }, { "epoch": 9.787480680061824, "grad_norm": 0.7982273101806641, "learning_rate": 3.0254508644746092e-05, "loss": 0.3963, "num_input_tokens_seen": 4261936, "step": 12665 }, { "epoch": 9.791344667697063, "grad_norm": 1.0108318328857422, "learning_rate": 3.023802422863139e-05, "loss": 0.5565, "num_input_tokens_seen": 4263664, "step": 12670 }, { "epoch": 9.795208655332303, "grad_norm": 0.6969626545906067, "learning_rate": 3.0221537430231893e-05, "loss": 0.3582, "num_input_tokens_seen": 4265296, "step": 12675 }, { "epoch": 9.799072642967543, "grad_norm": 0.9379662871360779, "learning_rate": 3.0205048257045898e-05, "loss": 0.3762, "num_input_tokens_seen": 4266928, "step": 12680 }, { "epoch": 9.802936630602781, "grad_norm": 1.048548936843872, "learning_rate": 3.0188556716572798e-05, "loss": 0.4595, "num_input_tokens_seen": 4268560, "step": 12685 }, { "epoch": 9.806800618238022, "grad_norm": 0.7675466537475586, "learning_rate": 3.017206281631302e-05, "loss": 0.4189, "num_input_tokens_seen": 4270608, "step": 12690 }, { "epoch": 9.810664605873262, "grad_norm": 1.4458966255187988, "learning_rate": 3.01555665637681e-05, "loss": 0.4163, "num_input_tokens_seen": 4272336, "step": 12695 }, { "epoch": 9.8145285935085, "grad_norm": 0.8167896866798401, "learning_rate": 3.0139067966440633e-05, "loss": 0.5929, "num_input_tokens_seen": 4274064, "step": 12700 }, { "epoch": 9.81839258114374, "grad_norm": 1.5331600904464722, "learning_rate": 3.0122567031834275e-05, "loss": 0.6915, "num_input_tokens_seen": 4275632, "step": 12705 }, { "epoch": 9.82225656877898, "grad_norm": 1.091620683670044, "learning_rate": 3.0106063767453756e-05, "loss": 0.3747, "num_input_tokens_seen": 4277328, "step": 12710 }, { "epoch": 9.826120556414219, "grad_norm": 0.6804801821708679, "learning_rate": 3.0089558180804857e-05, "loss": 0.4235, "num_input_tokens_seen": 4279024, "step": 12715 }, { "epoch": 9.82998454404946, "grad_norm": 0.7683240175247192, "learning_rate": 3.0073050279394416e-05, "loss": 0.3637, "num_input_tokens_seen": 4280720, "step": 12720 }, { "epoch": 9.833848531684698, "grad_norm": 1.0107792615890503, "learning_rate": 3.0056540070730323e-05, "loss": 0.5059, "num_input_tokens_seen": 4282512, "step": 12725 }, { "epoch": 9.837712519319938, "grad_norm": 1.5086636543273926, "learning_rate": 3.0040027562321525e-05, "loss": 0.6592, "num_input_tokens_seen": 4284144, "step": 12730 }, { "epoch": 9.841576506955178, "grad_norm": 0.9582411646842957, "learning_rate": 3.0023512761678017e-05, "loss": 0.5116, "num_input_tokens_seen": 4285904, "step": 12735 }, { "epoch": 9.845440494590417, "grad_norm": 0.6952570080757141, "learning_rate": 3.0006995676310813e-05, "loss": 0.458, "num_input_tokens_seen": 4287504, "step": 12740 }, { "epoch": 9.849304482225657, "grad_norm": 0.8648666739463806, "learning_rate": 2.9990476313731986e-05, "loss": 0.5038, "num_input_tokens_seen": 4289360, "step": 12745 }, { "epoch": 9.853168469860897, "grad_norm": 0.9494401216506958, "learning_rate": 2.997395468145465e-05, "loss": 0.3528, "num_input_tokens_seen": 4290992, "step": 12750 }, { "epoch": 9.857032457496135, "grad_norm": 0.7824193835258484, "learning_rate": 2.995743078699294e-05, "loss": 0.3787, "num_input_tokens_seen": 4292688, "step": 12755 }, { "epoch": 9.860896445131376, "grad_norm": 0.8072684407234192, "learning_rate": 2.994090463786201e-05, "loss": 0.4521, "num_input_tokens_seen": 4294288, "step": 12760 }, { "epoch": 9.864760432766616, "grad_norm": 1.2025846242904663, "learning_rate": 2.9924376241578068e-05, "loss": 0.4985, "num_input_tokens_seen": 4295888, "step": 12765 }, { "epoch": 9.868624420401854, "grad_norm": 0.6552515029907227, "learning_rate": 2.990784560565832e-05, "loss": 0.4145, "num_input_tokens_seen": 4297520, "step": 12770 }, { "epoch": 9.872488408037094, "grad_norm": 0.8425571918487549, "learning_rate": 2.9891312737620996e-05, "loss": 0.4091, "num_input_tokens_seen": 4299248, "step": 12775 }, { "epoch": 9.876352395672335, "grad_norm": 1.124666452407837, "learning_rate": 2.987477764498534e-05, "loss": 0.3878, "num_input_tokens_seen": 4300848, "step": 12780 }, { "epoch": 9.880216383307573, "grad_norm": 1.3697420358657837, "learning_rate": 2.985824033527163e-05, "loss": 0.5398, "num_input_tokens_seen": 4302416, "step": 12785 }, { "epoch": 9.884080370942813, "grad_norm": 1.6511329412460327, "learning_rate": 2.9841700816001115e-05, "loss": 0.7455, "num_input_tokens_seen": 4304336, "step": 12790 }, { "epoch": 9.887944358578052, "grad_norm": 0.580143928527832, "learning_rate": 2.9825159094696076e-05, "loss": 0.335, "num_input_tokens_seen": 4306160, "step": 12795 }, { "epoch": 9.891808346213292, "grad_norm": 1.1495286226272583, "learning_rate": 2.9808615178879778e-05, "loss": 0.4595, "num_input_tokens_seen": 4307792, "step": 12800 }, { "epoch": 9.895672333848532, "grad_norm": 1.0755383968353271, "learning_rate": 2.9792069076076502e-05, "loss": 0.6717, "num_input_tokens_seen": 4309552, "step": 12805 }, { "epoch": 9.89953632148377, "grad_norm": 0.9413804411888123, "learning_rate": 2.9775520793811514e-05, "loss": 0.5696, "num_input_tokens_seen": 4311440, "step": 12810 }, { "epoch": 9.90340030911901, "grad_norm": 0.9086689949035645, "learning_rate": 2.975897033961107e-05, "loss": 0.4222, "num_input_tokens_seen": 4313168, "step": 12815 }, { "epoch": 9.907264296754251, "grad_norm": 0.9526796936988831, "learning_rate": 2.974241772100241e-05, "loss": 0.5032, "num_input_tokens_seen": 4314640, "step": 12820 }, { "epoch": 9.91112828438949, "grad_norm": 1.341312289237976, "learning_rate": 2.972586294551377e-05, "loss": 0.5972, "num_input_tokens_seen": 4316528, "step": 12825 }, { "epoch": 9.91499227202473, "grad_norm": 0.8662987947463989, "learning_rate": 2.970930602067436e-05, "loss": 0.4286, "num_input_tokens_seen": 4318064, "step": 12830 }, { "epoch": 9.91885625965997, "grad_norm": 1.45316481590271, "learning_rate": 2.969274695401437e-05, "loss": 0.4498, "num_input_tokens_seen": 4319728, "step": 12835 }, { "epoch": 9.922720247295208, "grad_norm": 1.4562664031982422, "learning_rate": 2.967618575306496e-05, "loss": 0.4022, "num_input_tokens_seen": 4321520, "step": 12840 }, { "epoch": 9.926584234930449, "grad_norm": 1.6898130178451538, "learning_rate": 2.9659622425358276e-05, "loss": 0.6692, "num_input_tokens_seen": 4322960, "step": 12845 }, { "epoch": 9.930448222565687, "grad_norm": 1.0463814735412598, "learning_rate": 2.9643056978427392e-05, "loss": 0.4417, "num_input_tokens_seen": 4324624, "step": 12850 }, { "epoch": 9.934312210200927, "grad_norm": 1.6094229221343994, "learning_rate": 2.9626489419806396e-05, "loss": 0.7801, "num_input_tokens_seen": 4326288, "step": 12855 }, { "epoch": 9.938176197836167, "grad_norm": 0.6696589589118958, "learning_rate": 2.96099197570303e-05, "loss": 0.4142, "num_input_tokens_seen": 4328016, "step": 12860 }, { "epoch": 9.942040185471406, "grad_norm": 0.6146266460418701, "learning_rate": 2.9593347997635096e-05, "loss": 0.3973, "num_input_tokens_seen": 4329744, "step": 12865 }, { "epoch": 9.945904173106646, "grad_norm": 0.643926739692688, "learning_rate": 2.9576774149157715e-05, "loss": 0.3909, "num_input_tokens_seen": 4331440, "step": 12870 }, { "epoch": 9.949768160741886, "grad_norm": 0.9077593088150024, "learning_rate": 2.9560198219136043e-05, "loss": 0.5386, "num_input_tokens_seen": 4332944, "step": 12875 }, { "epoch": 9.953632148377125, "grad_norm": 1.2603468894958496, "learning_rate": 2.9543620215108904e-05, "loss": 0.4427, "num_input_tokens_seen": 4334896, "step": 12880 }, { "epoch": 9.957496136012365, "grad_norm": 1.208632469177246, "learning_rate": 2.952704014461608e-05, "loss": 0.3905, "num_input_tokens_seen": 4336528, "step": 12885 }, { "epoch": 9.961360123647605, "grad_norm": 0.7758678197860718, "learning_rate": 2.9510458015198295e-05, "loss": 0.3585, "num_input_tokens_seen": 4338352, "step": 12890 }, { "epoch": 9.965224111282843, "grad_norm": 0.855909526348114, "learning_rate": 2.949387383439719e-05, "loss": 0.4887, "num_input_tokens_seen": 4339984, "step": 12895 }, { "epoch": 9.969088098918084, "grad_norm": 1.0119596719741821, "learning_rate": 2.9477287609755343e-05, "loss": 0.4046, "num_input_tokens_seen": 4341808, "step": 12900 }, { "epoch": 9.972952086553324, "grad_norm": 0.6067941188812256, "learning_rate": 2.946069934881629e-05, "loss": 0.4093, "num_input_tokens_seen": 4343248, "step": 12905 }, { "epoch": 9.976816074188562, "grad_norm": 1.1640352010726929, "learning_rate": 2.9444109059124458e-05, "loss": 0.5719, "num_input_tokens_seen": 4344880, "step": 12910 }, { "epoch": 9.980680061823803, "grad_norm": 0.7601434588432312, "learning_rate": 2.9427516748225205e-05, "loss": 0.4737, "num_input_tokens_seen": 4346928, "step": 12915 }, { "epoch": 9.984544049459041, "grad_norm": 1.8085036277770996, "learning_rate": 2.9410922423664823e-05, "loss": 0.543, "num_input_tokens_seen": 4348848, "step": 12920 }, { "epoch": 9.988408037094281, "grad_norm": 1.7802551984786987, "learning_rate": 2.9394326092990504e-05, "loss": 0.5528, "num_input_tokens_seen": 4350576, "step": 12925 }, { "epoch": 9.992272024729521, "grad_norm": 0.9246548414230347, "learning_rate": 2.937772776375037e-05, "loss": 0.4819, "num_input_tokens_seen": 4352176, "step": 12930 }, { "epoch": 9.99613601236476, "grad_norm": 0.5296974778175354, "learning_rate": 2.936112744349342e-05, "loss": 0.3959, "num_input_tokens_seen": 4353584, "step": 12935 }, { "epoch": 10.0, "grad_norm": 2.8497095108032227, "learning_rate": 2.934452513976959e-05, "loss": 0.5449, "num_input_tokens_seen": 4355072, "step": 12940 }, { "epoch": 10.0, "eval_loss": 0.47380849719047546, "eval_runtime": 6.8719, "eval_samples_per_second": 83.674, "eval_steps_per_second": 20.955, "num_input_tokens_seen": 4355072, "step": 12940 }, { "epoch": 10.00386398763524, "grad_norm": 0.5145106911659241, "learning_rate": 2.9327920860129722e-05, "loss": 0.3496, "num_input_tokens_seen": 4356992, "step": 12945 }, { "epoch": 10.007727975270479, "grad_norm": 1.328047275543213, "learning_rate": 2.9311314612125517e-05, "loss": 0.4023, "num_input_tokens_seen": 4358944, "step": 12950 }, { "epoch": 10.011591962905719, "grad_norm": 0.6946138739585876, "learning_rate": 2.9294706403309614e-05, "loss": 0.4028, "num_input_tokens_seen": 4360576, "step": 12955 }, { "epoch": 10.015455950540959, "grad_norm": 0.9613583087921143, "learning_rate": 2.9278096241235508e-05, "loss": 0.4478, "num_input_tokens_seen": 4362368, "step": 12960 }, { "epoch": 10.019319938176197, "grad_norm": 0.7661211490631104, "learning_rate": 2.9261484133457624e-05, "loss": 0.7579, "num_input_tokens_seen": 4364288, "step": 12965 }, { "epoch": 10.023183925811438, "grad_norm": 0.7683583498001099, "learning_rate": 2.9244870087531222e-05, "loss": 0.4039, "num_input_tokens_seen": 4366016, "step": 12970 }, { "epoch": 10.027047913446676, "grad_norm": 1.9357279539108276, "learning_rate": 2.9228254111012494e-05, "loss": 0.4357, "num_input_tokens_seen": 4367424, "step": 12975 }, { "epoch": 10.030911901081916, "grad_norm": 0.7043845057487488, "learning_rate": 2.9211636211458464e-05, "loss": 0.4489, "num_input_tokens_seen": 4369248, "step": 12980 }, { "epoch": 10.034775888717157, "grad_norm": 1.3498258590698242, "learning_rate": 2.9195016396427067e-05, "loss": 0.7081, "num_input_tokens_seen": 4371072, "step": 12985 }, { "epoch": 10.038639876352395, "grad_norm": 1.700111746788025, "learning_rate": 2.9178394673477094e-05, "loss": 0.3932, "num_input_tokens_seen": 4372736, "step": 12990 }, { "epoch": 10.042503863987635, "grad_norm": 0.9827092289924622, "learning_rate": 2.9161771050168203e-05, "loss": 0.3531, "num_input_tokens_seen": 4374400, "step": 12995 }, { "epoch": 10.046367851622875, "grad_norm": 0.5483196973800659, "learning_rate": 2.9145145534060907e-05, "loss": 0.3651, "num_input_tokens_seen": 4376064, "step": 13000 }, { "epoch": 10.050231839258114, "grad_norm": 2.206345558166504, "learning_rate": 2.91285181327166e-05, "loss": 0.6187, "num_input_tokens_seen": 4377728, "step": 13005 }, { "epoch": 10.054095826893354, "grad_norm": 2.6812798976898193, "learning_rate": 2.9111888853697523e-05, "loss": 0.4612, "num_input_tokens_seen": 4379392, "step": 13010 }, { "epoch": 10.057959814528594, "grad_norm": 0.695139467716217, "learning_rate": 2.909525770456677e-05, "loss": 0.4164, "num_input_tokens_seen": 4381120, "step": 13015 }, { "epoch": 10.061823802163833, "grad_norm": 0.7305116057395935, "learning_rate": 2.9078624692888277e-05, "loss": 0.459, "num_input_tokens_seen": 4382848, "step": 13020 }, { "epoch": 10.065687789799073, "grad_norm": 2.9637837409973145, "learning_rate": 2.906198982622686e-05, "loss": 0.5374, "num_input_tokens_seen": 4384352, "step": 13025 }, { "epoch": 10.069551777434313, "grad_norm": 1.012410044670105, "learning_rate": 2.9045353112148144e-05, "loss": 0.3604, "num_input_tokens_seen": 4385952, "step": 13030 }, { "epoch": 10.073415765069551, "grad_norm": 1.1094621419906616, "learning_rate": 2.9028714558218596e-05, "loss": 0.4306, "num_input_tokens_seen": 4387520, "step": 13035 }, { "epoch": 10.077279752704792, "grad_norm": 0.9507516026496887, "learning_rate": 2.9012074172005542e-05, "loss": 0.4535, "num_input_tokens_seen": 4389248, "step": 13040 }, { "epoch": 10.08114374034003, "grad_norm": 0.9023739099502563, "learning_rate": 2.8995431961077136e-05, "loss": 0.4855, "num_input_tokens_seen": 4390944, "step": 13045 }, { "epoch": 10.08500772797527, "grad_norm": 0.8676549792289734, "learning_rate": 2.8978787933002345e-05, "loss": 0.3239, "num_input_tokens_seen": 4392800, "step": 13050 }, { "epoch": 10.08887171561051, "grad_norm": 0.8858481645584106, "learning_rate": 2.896214209535097e-05, "loss": 0.3616, "num_input_tokens_seen": 4394304, "step": 13055 }, { "epoch": 10.092735703245749, "grad_norm": 0.8824011087417603, "learning_rate": 2.894549445569364e-05, "loss": 0.7044, "num_input_tokens_seen": 4395904, "step": 13060 }, { "epoch": 10.09659969088099, "grad_norm": 0.8493468761444092, "learning_rate": 2.892884502160181e-05, "loss": 0.4787, "num_input_tokens_seen": 4397376, "step": 13065 }, { "epoch": 10.10046367851623, "grad_norm": 1.8618353605270386, "learning_rate": 2.8912193800647724e-05, "loss": 0.8939, "num_input_tokens_seen": 4398880, "step": 13070 }, { "epoch": 10.104327666151468, "grad_norm": 1.1054136753082275, "learning_rate": 2.889554080040448e-05, "loss": 0.3364, "num_input_tokens_seen": 4400448, "step": 13075 }, { "epoch": 10.108191653786708, "grad_norm": 1.5327320098876953, "learning_rate": 2.887888602844594e-05, "loss": 0.4258, "num_input_tokens_seen": 4402304, "step": 13080 }, { "epoch": 10.112055641421948, "grad_norm": 0.7546743154525757, "learning_rate": 2.8862229492346814e-05, "loss": 0.4728, "num_input_tokens_seen": 4404032, "step": 13085 }, { "epoch": 10.115919629057187, "grad_norm": 0.7657408118247986, "learning_rate": 2.8845571199682574e-05, "loss": 0.437, "num_input_tokens_seen": 4405952, "step": 13090 }, { "epoch": 10.119783616692427, "grad_norm": 0.8069931268692017, "learning_rate": 2.8828911158029535e-05, "loss": 0.454, "num_input_tokens_seen": 4407616, "step": 13095 }, { "epoch": 10.123647604327665, "grad_norm": 0.9319244623184204, "learning_rate": 2.881224937496476e-05, "loss": 0.3863, "num_input_tokens_seen": 4409216, "step": 13100 }, { "epoch": 10.127511591962906, "grad_norm": 1.503044843673706, "learning_rate": 2.8795585858066142e-05, "loss": 0.3787, "num_input_tokens_seen": 4410848, "step": 13105 }, { "epoch": 10.131375579598146, "grad_norm": 1.023878574371338, "learning_rate": 2.877892061491235e-05, "loss": 0.3858, "num_input_tokens_seen": 4412320, "step": 13110 }, { "epoch": 10.135239567233384, "grad_norm": 1.1009070873260498, "learning_rate": 2.876225365308283e-05, "loss": 0.6497, "num_input_tokens_seen": 4414272, "step": 13115 }, { "epoch": 10.139103554868624, "grad_norm": 0.9470617175102234, "learning_rate": 2.8745584980157813e-05, "loss": 0.4541, "num_input_tokens_seen": 4415776, "step": 13120 }, { "epoch": 10.142967542503865, "grad_norm": 0.8690896034240723, "learning_rate": 2.8728914603718315e-05, "loss": 0.3734, "num_input_tokens_seen": 4417440, "step": 13125 }, { "epoch": 10.146831530139103, "grad_norm": 2.426319122314453, "learning_rate": 2.8712242531346127e-05, "loss": 0.5329, "num_input_tokens_seen": 4419232, "step": 13130 }, { "epoch": 10.150695517774343, "grad_norm": 1.0626634359359741, "learning_rate": 2.869556877062381e-05, "loss": 0.4217, "num_input_tokens_seen": 4420960, "step": 13135 }, { "epoch": 10.154559505409583, "grad_norm": 1.9567257165908813, "learning_rate": 2.867889332913467e-05, "loss": 0.4821, "num_input_tokens_seen": 4422752, "step": 13140 }, { "epoch": 10.158423493044822, "grad_norm": 1.1495366096496582, "learning_rate": 2.8662216214462822e-05, "loss": 0.4224, "num_input_tokens_seen": 4424384, "step": 13145 }, { "epoch": 10.162287480680062, "grad_norm": 1.0662990808486938, "learning_rate": 2.8645537434193104e-05, "loss": 0.3745, "num_input_tokens_seen": 4426112, "step": 13150 }, { "epoch": 10.166151468315302, "grad_norm": 0.9811434745788574, "learning_rate": 2.862885699591113e-05, "loss": 0.5469, "num_input_tokens_seen": 4427776, "step": 13155 }, { "epoch": 10.17001545595054, "grad_norm": 1.165123462677002, "learning_rate": 2.861217490720326e-05, "loss": 0.4524, "num_input_tokens_seen": 4429536, "step": 13160 }, { "epoch": 10.173879443585781, "grad_norm": 0.6220985054969788, "learning_rate": 2.8595491175656608e-05, "loss": 0.3758, "num_input_tokens_seen": 4430976, "step": 13165 }, { "epoch": 10.17774343122102, "grad_norm": 0.809310793876648, "learning_rate": 2.8578805808859044e-05, "loss": 0.4603, "num_input_tokens_seen": 4432576, "step": 13170 }, { "epoch": 10.18160741885626, "grad_norm": 0.9519204497337341, "learning_rate": 2.8562118814399158e-05, "loss": 0.3769, "num_input_tokens_seen": 4434432, "step": 13175 }, { "epoch": 10.1854714064915, "grad_norm": 1.8815250396728516, "learning_rate": 2.854543019986631e-05, "loss": 0.3529, "num_input_tokens_seen": 4436384, "step": 13180 }, { "epoch": 10.189335394126738, "grad_norm": 1.0919287204742432, "learning_rate": 2.852873997285057e-05, "loss": 0.4803, "num_input_tokens_seen": 4437984, "step": 13185 }, { "epoch": 10.193199381761978, "grad_norm": 1.0789363384246826, "learning_rate": 2.851204814094276e-05, "loss": 0.4515, "num_input_tokens_seen": 4439808, "step": 13190 }, { "epoch": 10.197063369397219, "grad_norm": 0.6819553971290588, "learning_rate": 2.849535471173442e-05, "loss": 0.3336, "num_input_tokens_seen": 4441408, "step": 13195 }, { "epoch": 10.200927357032457, "grad_norm": 1.2615572214126587, "learning_rate": 2.8478659692817816e-05, "loss": 0.438, "num_input_tokens_seen": 4443168, "step": 13200 }, { "epoch": 10.204791344667697, "grad_norm": 1.0268518924713135, "learning_rate": 2.8461963091785966e-05, "loss": 0.621, "num_input_tokens_seen": 4444800, "step": 13205 }, { "epoch": 10.208655332302937, "grad_norm": 1.3264788389205933, "learning_rate": 2.8445264916232563e-05, "loss": 0.4248, "num_input_tokens_seen": 4446784, "step": 13210 }, { "epoch": 10.212519319938176, "grad_norm": 0.8190250396728516, "learning_rate": 2.8428565173752043e-05, "loss": 0.4167, "num_input_tokens_seen": 4448480, "step": 13215 }, { "epoch": 10.216383307573416, "grad_norm": 0.9790405631065369, "learning_rate": 2.841186387193954e-05, "loss": 0.524, "num_input_tokens_seen": 4449856, "step": 13220 }, { "epoch": 10.220247295208654, "grad_norm": 0.9241940379142761, "learning_rate": 2.839516101839093e-05, "loss": 0.38, "num_input_tokens_seen": 4451552, "step": 13225 }, { "epoch": 10.224111282843895, "grad_norm": 0.6172051429748535, "learning_rate": 2.8378456620702748e-05, "loss": 0.5399, "num_input_tokens_seen": 4453376, "step": 13230 }, { "epoch": 10.227975270479135, "grad_norm": 1.045253872871399, "learning_rate": 2.8361750686472265e-05, "loss": 0.3692, "num_input_tokens_seen": 4454880, "step": 13235 }, { "epoch": 10.231839258114373, "grad_norm": 1.4533714056015015, "learning_rate": 2.8345043223297436e-05, "loss": 0.4162, "num_input_tokens_seen": 4456704, "step": 13240 }, { "epoch": 10.235703245749614, "grad_norm": 0.7570275068283081, "learning_rate": 2.8328334238776915e-05, "loss": 0.4021, "num_input_tokens_seen": 4458304, "step": 13245 }, { "epoch": 10.239567233384854, "grad_norm": 0.9477952718734741, "learning_rate": 2.831162374051005e-05, "loss": 0.4429, "num_input_tokens_seen": 4459648, "step": 13250 }, { "epoch": 10.243431221020092, "grad_norm": 1.0809885263442993, "learning_rate": 2.829491173609688e-05, "loss": 0.4961, "num_input_tokens_seen": 4461408, "step": 13255 }, { "epoch": 10.247295208655332, "grad_norm": 1.350314736366272, "learning_rate": 2.8278198233138115e-05, "loss": 0.4879, "num_input_tokens_seen": 4462944, "step": 13260 }, { "epoch": 10.251159196290573, "grad_norm": 0.9470356702804565, "learning_rate": 2.826148323923516e-05, "loss": 0.65, "num_input_tokens_seen": 4464416, "step": 13265 }, { "epoch": 10.255023183925811, "grad_norm": 0.7882862687110901, "learning_rate": 2.82447667619901e-05, "loss": 0.4391, "num_input_tokens_seen": 4466240, "step": 13270 }, { "epoch": 10.258887171561051, "grad_norm": 0.7559458017349243, "learning_rate": 2.8228048809005687e-05, "loss": 0.6012, "num_input_tokens_seen": 4467744, "step": 13275 }, { "epoch": 10.262751159196291, "grad_norm": 0.964965283870697, "learning_rate": 2.8211329387885333e-05, "loss": 0.4007, "num_input_tokens_seen": 4469344, "step": 13280 }, { "epoch": 10.26661514683153, "grad_norm": 0.8243792057037354, "learning_rate": 2.819460850623315e-05, "loss": 0.5091, "num_input_tokens_seen": 4471232, "step": 13285 }, { "epoch": 10.27047913446677, "grad_norm": 0.9791451692581177, "learning_rate": 2.8177886171653888e-05, "loss": 0.8419, "num_input_tokens_seen": 4472832, "step": 13290 }, { "epoch": 10.274343122102009, "grad_norm": 1.0425466299057007, "learning_rate": 2.8161162391752955e-05, "loss": 0.3708, "num_input_tokens_seen": 4474208, "step": 13295 }, { "epoch": 10.278207109737249, "grad_norm": 0.9712364673614502, "learning_rate": 2.814443717413644e-05, "loss": 0.4422, "num_input_tokens_seen": 4475872, "step": 13300 }, { "epoch": 10.282071097372489, "grad_norm": 0.809223473072052, "learning_rate": 2.8127710526411067e-05, "loss": 0.5067, "num_input_tokens_seen": 4477504, "step": 13305 }, { "epoch": 10.285935085007727, "grad_norm": 1.8828089237213135, "learning_rate": 2.8110982456184213e-05, "loss": 0.4706, "num_input_tokens_seen": 4479328, "step": 13310 }, { "epoch": 10.289799072642968, "grad_norm": 0.7177894711494446, "learning_rate": 2.8094252971063912e-05, "loss": 0.3636, "num_input_tokens_seen": 4481088, "step": 13315 }, { "epoch": 10.293663060278208, "grad_norm": 1.40597403049469, "learning_rate": 2.807752207865883e-05, "loss": 0.5238, "num_input_tokens_seen": 4482688, "step": 13320 }, { "epoch": 10.297527047913446, "grad_norm": 1.1798697710037231, "learning_rate": 2.806078978657827e-05, "loss": 0.642, "num_input_tokens_seen": 4484192, "step": 13325 }, { "epoch": 10.301391035548686, "grad_norm": 1.2105940580368042, "learning_rate": 2.804405610243218e-05, "loss": 0.4466, "num_input_tokens_seen": 4485760, "step": 13330 }, { "epoch": 10.305255023183927, "grad_norm": 0.8379480242729187, "learning_rate": 2.8027321033831156e-05, "loss": 0.4327, "num_input_tokens_seen": 4487520, "step": 13335 }, { "epoch": 10.309119010819165, "grad_norm": 0.8257488012313843, "learning_rate": 2.801058458838639e-05, "loss": 0.4187, "num_input_tokens_seen": 4489248, "step": 13340 }, { "epoch": 10.312982998454405, "grad_norm": 0.7923182845115662, "learning_rate": 2.7993846773709725e-05, "loss": 0.4953, "num_input_tokens_seen": 4490976, "step": 13345 }, { "epoch": 10.316846986089644, "grad_norm": 1.6052963733673096, "learning_rate": 2.7977107597413614e-05, "loss": 0.5409, "num_input_tokens_seen": 4492544, "step": 13350 }, { "epoch": 10.320710973724884, "grad_norm": 0.9052250385284424, "learning_rate": 2.796036706711115e-05, "loss": 0.4659, "num_input_tokens_seen": 4494208, "step": 13355 }, { "epoch": 10.324574961360124, "grad_norm": 0.9165639281272888, "learning_rate": 2.7943625190416005e-05, "loss": 0.3852, "num_input_tokens_seen": 4495840, "step": 13360 }, { "epoch": 10.328438948995363, "grad_norm": 0.6311342716217041, "learning_rate": 2.7926881974942488e-05, "loss": 0.4337, "num_input_tokens_seen": 4497664, "step": 13365 }, { "epoch": 10.332302936630603, "grad_norm": 0.8914085626602173, "learning_rate": 2.7910137428305534e-05, "loss": 0.3516, "num_input_tokens_seen": 4499104, "step": 13370 }, { "epoch": 10.336166924265843, "grad_norm": 0.6751750111579895, "learning_rate": 2.7893391558120646e-05, "loss": 0.6417, "num_input_tokens_seen": 4500960, "step": 13375 }, { "epoch": 10.340030911901081, "grad_norm": 1.1050670146942139, "learning_rate": 2.7876644372003945e-05, "loss": 0.371, "num_input_tokens_seen": 4502528, "step": 13380 }, { "epoch": 10.343894899536322, "grad_norm": 0.7991294264793396, "learning_rate": 2.7859895877572155e-05, "loss": 0.4439, "num_input_tokens_seen": 4504032, "step": 13385 }, { "epoch": 10.347758887171562, "grad_norm": 0.8630145192146301, "learning_rate": 2.78431460824426e-05, "loss": 0.389, "num_input_tokens_seen": 4505472, "step": 13390 }, { "epoch": 10.3516228748068, "grad_norm": 0.8628560304641724, "learning_rate": 2.7826394994233178e-05, "loss": 0.5506, "num_input_tokens_seen": 4507104, "step": 13395 }, { "epoch": 10.35548686244204, "grad_norm": 1.0316472053527832, "learning_rate": 2.780964262056239e-05, "loss": 0.4976, "num_input_tokens_seen": 4508576, "step": 13400 }, { "epoch": 10.35935085007728, "grad_norm": 0.7058213949203491, "learning_rate": 2.7792888969049304e-05, "loss": 0.4621, "num_input_tokens_seen": 4510336, "step": 13405 }, { "epoch": 10.363214837712519, "grad_norm": 0.7799232006072998, "learning_rate": 2.777613404731359e-05, "loss": 0.3998, "num_input_tokens_seen": 4512000, "step": 13410 }, { "epoch": 10.36707882534776, "grad_norm": 1.578855276107788, "learning_rate": 2.7759377862975484e-05, "loss": 0.5428, "num_input_tokens_seen": 4513632, "step": 13415 }, { "epoch": 10.370942812982998, "grad_norm": 1.0289757251739502, "learning_rate": 2.7742620423655806e-05, "loss": 0.3976, "num_input_tokens_seen": 4515200, "step": 13420 }, { "epoch": 10.374806800618238, "grad_norm": 1.1098743677139282, "learning_rate": 2.772586173697593e-05, "loss": 0.4523, "num_input_tokens_seen": 4516768, "step": 13425 }, { "epoch": 10.378670788253478, "grad_norm": 1.0675519704818726, "learning_rate": 2.7709101810557813e-05, "loss": 0.4639, "num_input_tokens_seen": 4518272, "step": 13430 }, { "epoch": 10.382534775888717, "grad_norm": 0.9192325472831726, "learning_rate": 2.769234065202397e-05, "loss": 0.4974, "num_input_tokens_seen": 4520032, "step": 13435 }, { "epoch": 10.386398763523957, "grad_norm": 0.9222530126571655, "learning_rate": 2.7675578268997477e-05, "loss": 0.429, "num_input_tokens_seen": 4521824, "step": 13440 }, { "epoch": 10.390262751159197, "grad_norm": 0.598755955696106, "learning_rate": 2.7658814669101956e-05, "loss": 0.4319, "num_input_tokens_seen": 4523520, "step": 13445 }, { "epoch": 10.394126738794435, "grad_norm": 1.026078701019287, "learning_rate": 2.7642049859961606e-05, "loss": 0.5796, "num_input_tokens_seen": 4525120, "step": 13450 }, { "epoch": 10.397990726429676, "grad_norm": 1.7176274061203003, "learning_rate": 2.7625283849201157e-05, "loss": 0.4551, "num_input_tokens_seen": 4526752, "step": 13455 }, { "epoch": 10.401854714064916, "grad_norm": 0.8417883515357971, "learning_rate": 2.760851664444589e-05, "loss": 0.4121, "num_input_tokens_seen": 4528288, "step": 13460 }, { "epoch": 10.405718701700154, "grad_norm": 1.2021563053131104, "learning_rate": 2.7591748253321632e-05, "loss": 0.468, "num_input_tokens_seen": 4529824, "step": 13465 }, { "epoch": 10.409582689335394, "grad_norm": 0.76055508852005, "learning_rate": 2.7574978683454743e-05, "loss": 0.3312, "num_input_tokens_seen": 4531360, "step": 13470 }, { "epoch": 10.413446676970633, "grad_norm": 1.9223968982696533, "learning_rate": 2.7558207942472136e-05, "loss": 0.4493, "num_input_tokens_seen": 4533056, "step": 13475 }, { "epoch": 10.417310664605873, "grad_norm": 1.140701174736023, "learning_rate": 2.7541436038001234e-05, "loss": 0.3895, "num_input_tokens_seen": 4534688, "step": 13480 }, { "epoch": 10.421174652241113, "grad_norm": 1.0866273641586304, "learning_rate": 2.7524662977669992e-05, "loss": 0.4698, "num_input_tokens_seen": 4536448, "step": 13485 }, { "epoch": 10.425038639876352, "grad_norm": 1.329126238822937, "learning_rate": 2.7507888769106914e-05, "loss": 0.3607, "num_input_tokens_seen": 4538016, "step": 13490 }, { "epoch": 10.428902627511592, "grad_norm": 0.8417384028434753, "learning_rate": 2.7491113419941013e-05, "loss": 0.5798, "num_input_tokens_seen": 4539456, "step": 13495 }, { "epoch": 10.432766615146832, "grad_norm": 1.0280263423919678, "learning_rate": 2.7474336937801798e-05, "loss": 0.4091, "num_input_tokens_seen": 4541408, "step": 13500 }, { "epoch": 10.43663060278207, "grad_norm": 1.0304580926895142, "learning_rate": 2.7457559330319326e-05, "loss": 0.5263, "num_input_tokens_seen": 4543264, "step": 13505 }, { "epoch": 10.44049459041731, "grad_norm": 1.097651720046997, "learning_rate": 2.744078060512416e-05, "loss": 0.4751, "num_input_tokens_seen": 4544608, "step": 13510 }, { "epoch": 10.444358578052551, "grad_norm": 0.8819422721862793, "learning_rate": 2.742400076984736e-05, "loss": 0.4155, "num_input_tokens_seen": 4546624, "step": 13515 }, { "epoch": 10.44822256568779, "grad_norm": 1.0966074466705322, "learning_rate": 2.7407219832120484e-05, "loss": 0.4033, "num_input_tokens_seen": 4548256, "step": 13520 }, { "epoch": 10.45208655332303, "grad_norm": 0.9430744647979736, "learning_rate": 2.7390437799575615e-05, "loss": 0.4323, "num_input_tokens_seen": 4550176, "step": 13525 }, { "epoch": 10.45595054095827, "grad_norm": 0.568055272102356, "learning_rate": 2.7373654679845323e-05, "loss": 0.3536, "num_input_tokens_seen": 4551808, "step": 13530 }, { "epoch": 10.459814528593508, "grad_norm": 0.8206682205200195, "learning_rate": 2.7356870480562662e-05, "loss": 0.4569, "num_input_tokens_seen": 4553472, "step": 13535 }, { "epoch": 10.463678516228748, "grad_norm": 0.6119564175605774, "learning_rate": 2.73400852093612e-05, "loss": 0.3795, "num_input_tokens_seen": 4555200, "step": 13540 }, { "epoch": 10.467542503863987, "grad_norm": 1.239930510520935, "learning_rate": 2.7323298873874958e-05, "loss": 0.5546, "num_input_tokens_seen": 4556896, "step": 13545 }, { "epoch": 10.471406491499227, "grad_norm": 0.7835540771484375, "learning_rate": 2.7306511481738483e-05, "loss": 0.6936, "num_input_tokens_seen": 4558496, "step": 13550 }, { "epoch": 10.475270479134467, "grad_norm": 0.887601375579834, "learning_rate": 2.7289723040586773e-05, "loss": 0.5286, "num_input_tokens_seen": 4560096, "step": 13555 }, { "epoch": 10.479134466769706, "grad_norm": 0.8748440742492676, "learning_rate": 2.7272933558055312e-05, "loss": 0.4081, "num_input_tokens_seen": 4561824, "step": 13560 }, { "epoch": 10.482998454404946, "grad_norm": 0.7764946818351746, "learning_rate": 2.725614304178005e-05, "loss": 0.4249, "num_input_tokens_seen": 4563360, "step": 13565 }, { "epoch": 10.486862442040186, "grad_norm": 0.8802481293678284, "learning_rate": 2.723935149939743e-05, "loss": 0.4982, "num_input_tokens_seen": 4564896, "step": 13570 }, { "epoch": 10.490726429675425, "grad_norm": 1.4044172763824463, "learning_rate": 2.7222558938544328e-05, "loss": 0.3825, "num_input_tokens_seen": 4566304, "step": 13575 }, { "epoch": 10.494590417310665, "grad_norm": 0.8593074679374695, "learning_rate": 2.7205765366858122e-05, "loss": 0.6385, "num_input_tokens_seen": 4568352, "step": 13580 }, { "epoch": 10.498454404945905, "grad_norm": 1.214312195777893, "learning_rate": 2.7188970791976603e-05, "loss": 0.5299, "num_input_tokens_seen": 4569984, "step": 13585 }, { "epoch": 10.502318392581143, "grad_norm": 0.7647075057029724, "learning_rate": 2.7172175221538064e-05, "loss": 0.5056, "num_input_tokens_seen": 4571872, "step": 13590 }, { "epoch": 10.506182380216384, "grad_norm": 1.8695446252822876, "learning_rate": 2.715537866318123e-05, "loss": 0.4208, "num_input_tokens_seen": 4573568, "step": 13595 }, { "epoch": 10.510046367851622, "grad_norm": 0.826322615146637, "learning_rate": 2.7138581124545274e-05, "loss": 0.4214, "num_input_tokens_seen": 4575168, "step": 13600 }, { "epoch": 10.513910355486862, "grad_norm": 1.4702003002166748, "learning_rate": 2.7121782613269807e-05, "loss": 0.3769, "num_input_tokens_seen": 4576800, "step": 13605 }, { "epoch": 10.517774343122102, "grad_norm": 0.7716343402862549, "learning_rate": 2.7104983136994903e-05, "loss": 0.4486, "num_input_tokens_seen": 4578464, "step": 13610 }, { "epoch": 10.521638330757341, "grad_norm": 2.7880361080169678, "learning_rate": 2.7088182703361065e-05, "loss": 0.5262, "num_input_tokens_seen": 4580096, "step": 13615 }, { "epoch": 10.525502318392581, "grad_norm": 0.8386353850364685, "learning_rate": 2.707138132000923e-05, "loss": 0.4571, "num_input_tokens_seen": 4581792, "step": 13620 }, { "epoch": 10.529366306027821, "grad_norm": 0.9938804507255554, "learning_rate": 2.7054578994580754e-05, "loss": 0.3936, "num_input_tokens_seen": 4583264, "step": 13625 }, { "epoch": 10.53323029366306, "grad_norm": 0.9279657006263733, "learning_rate": 2.7037775734717458e-05, "loss": 0.4427, "num_input_tokens_seen": 4584832, "step": 13630 }, { "epoch": 10.5370942812983, "grad_norm": 1.0482957363128662, "learning_rate": 2.7020971548061554e-05, "loss": 0.4618, "num_input_tokens_seen": 4586528, "step": 13635 }, { "epoch": 10.54095826893354, "grad_norm": 0.7838960289955139, "learning_rate": 2.700416644225568e-05, "loss": 0.3618, "num_input_tokens_seen": 4588544, "step": 13640 }, { "epoch": 10.544822256568779, "grad_norm": 1.0727797746658325, "learning_rate": 2.6987360424942903e-05, "loss": 0.4118, "num_input_tokens_seen": 4590240, "step": 13645 }, { "epoch": 10.548686244204019, "grad_norm": 1.0744082927703857, "learning_rate": 2.6970553503766717e-05, "loss": 0.4432, "num_input_tokens_seen": 4592160, "step": 13650 }, { "epoch": 10.552550231839259, "grad_norm": 0.9261411428451538, "learning_rate": 2.695374568637099e-05, "loss": 0.3632, "num_input_tokens_seen": 4593664, "step": 13655 }, { "epoch": 10.556414219474497, "grad_norm": 0.7570378184318542, "learning_rate": 2.6936936980400018e-05, "loss": 0.3912, "num_input_tokens_seen": 4595392, "step": 13660 }, { "epoch": 10.560278207109738, "grad_norm": 0.8827912211418152, "learning_rate": 2.692012739349851e-05, "loss": 0.6113, "num_input_tokens_seen": 4597152, "step": 13665 }, { "epoch": 10.564142194744976, "grad_norm": 2.209629774093628, "learning_rate": 2.6903316933311568e-05, "loss": 0.3879, "num_input_tokens_seen": 4598688, "step": 13670 }, { "epoch": 10.568006182380216, "grad_norm": 1.0395385026931763, "learning_rate": 2.688650560748468e-05, "loss": 0.406, "num_input_tokens_seen": 4600704, "step": 13675 }, { "epoch": 10.571870170015456, "grad_norm": 0.6943600177764893, "learning_rate": 2.6869693423663754e-05, "loss": 0.4227, "num_input_tokens_seen": 4602464, "step": 13680 }, { "epoch": 10.575734157650695, "grad_norm": 0.5817541480064392, "learning_rate": 2.6852880389495057e-05, "loss": 0.5505, "num_input_tokens_seen": 4604064, "step": 13685 }, { "epoch": 10.579598145285935, "grad_norm": 0.6575820446014404, "learning_rate": 2.6836066512625264e-05, "loss": 0.4843, "num_input_tokens_seen": 4605760, "step": 13690 }, { "epoch": 10.583462132921175, "grad_norm": 0.8349799513816833, "learning_rate": 2.6819251800701416e-05, "loss": 0.4639, "num_input_tokens_seen": 4607328, "step": 13695 }, { "epoch": 10.587326120556414, "grad_norm": 1.2653018236160278, "learning_rate": 2.6802436261370967e-05, "loss": 0.4574, "num_input_tokens_seen": 4608832, "step": 13700 }, { "epoch": 10.591190108191654, "grad_norm": 0.6244285106658936, "learning_rate": 2.67856199022817e-05, "loss": 0.4367, "num_input_tokens_seen": 4610720, "step": 13705 }, { "epoch": 10.595054095826894, "grad_norm": 1.1135005950927734, "learning_rate": 2.676880273108181e-05, "loss": 0.562, "num_input_tokens_seen": 4612448, "step": 13710 }, { "epoch": 10.598918083462133, "grad_norm": 0.7733105421066284, "learning_rate": 2.675198475541985e-05, "loss": 0.3632, "num_input_tokens_seen": 4613888, "step": 13715 }, { "epoch": 10.602782071097373, "grad_norm": 1.2327327728271484, "learning_rate": 2.673516598294474e-05, "loss": 0.4443, "num_input_tokens_seen": 4615392, "step": 13720 }, { "epoch": 10.606646058732611, "grad_norm": 1.8511216640472412, "learning_rate": 2.6718346421305735e-05, "loss": 0.687, "num_input_tokens_seen": 4616928, "step": 13725 }, { "epoch": 10.610510046367851, "grad_norm": 0.8770402073860168, "learning_rate": 2.6701526078152484e-05, "loss": 0.4278, "num_input_tokens_seen": 4618784, "step": 13730 }, { "epoch": 10.614374034003092, "grad_norm": 1.1842316389083862, "learning_rate": 2.6684704961134994e-05, "loss": 0.4304, "num_input_tokens_seen": 4620416, "step": 13735 }, { "epoch": 10.61823802163833, "grad_norm": 1.0129398107528687, "learning_rate": 2.6667883077903595e-05, "loss": 0.4037, "num_input_tokens_seen": 4622560, "step": 13740 }, { "epoch": 10.62210200927357, "grad_norm": 0.8006773591041565, "learning_rate": 2.6651060436108977e-05, "loss": 0.4692, "num_input_tokens_seen": 4624384, "step": 13745 }, { "epoch": 10.62596599690881, "grad_norm": 1.2477974891662598, "learning_rate": 2.6634237043402193e-05, "loss": 0.6567, "num_input_tokens_seen": 4625856, "step": 13750 }, { "epoch": 10.629829984544049, "grad_norm": 1.1893991231918335, "learning_rate": 2.6617412907434612e-05, "loss": 0.5265, "num_input_tokens_seen": 4627744, "step": 13755 }, { "epoch": 10.63369397217929, "grad_norm": 1.392235517501831, "learning_rate": 2.6600588035857955e-05, "loss": 0.5597, "num_input_tokens_seen": 4629504, "step": 13760 }, { "epoch": 10.63755795981453, "grad_norm": 0.7663353085517883, "learning_rate": 2.6583762436324266e-05, "loss": 0.395, "num_input_tokens_seen": 4631264, "step": 13765 }, { "epoch": 10.641421947449768, "grad_norm": 0.49899059534072876, "learning_rate": 2.6566936116485946e-05, "loss": 0.3425, "num_input_tokens_seen": 4633152, "step": 13770 }, { "epoch": 10.645285935085008, "grad_norm": 1.1002360582351685, "learning_rate": 2.65501090839957e-05, "loss": 0.4069, "num_input_tokens_seen": 4634720, "step": 13775 }, { "epoch": 10.649149922720248, "grad_norm": 0.927388608455658, "learning_rate": 2.653328134650655e-05, "loss": 0.5231, "num_input_tokens_seen": 4636576, "step": 13780 }, { "epoch": 10.653013910355487, "grad_norm": 0.8334895372390747, "learning_rate": 2.651645291167186e-05, "loss": 0.4745, "num_input_tokens_seen": 4638336, "step": 13785 }, { "epoch": 10.656877897990727, "grad_norm": 0.8341463208198547, "learning_rate": 2.649962378714531e-05, "loss": 0.5101, "num_input_tokens_seen": 4639936, "step": 13790 }, { "epoch": 10.660741885625965, "grad_norm": 0.6773219704627991, "learning_rate": 2.648279398058088e-05, "loss": 0.368, "num_input_tokens_seen": 4641632, "step": 13795 }, { "epoch": 10.664605873261205, "grad_norm": 1.0047956705093384, "learning_rate": 2.6465963499632866e-05, "loss": 0.4409, "num_input_tokens_seen": 4643200, "step": 13800 }, { "epoch": 10.668469860896446, "grad_norm": 1.0524712800979614, "learning_rate": 2.644913235195587e-05, "loss": 0.4306, "num_input_tokens_seen": 4644736, "step": 13805 }, { "epoch": 10.672333848531684, "grad_norm": 1.1518354415893555, "learning_rate": 2.643230054520481e-05, "loss": 0.4943, "num_input_tokens_seen": 4646368, "step": 13810 }, { "epoch": 10.676197836166924, "grad_norm": 1.3230160474777222, "learning_rate": 2.6415468087034872e-05, "loss": 0.5419, "num_input_tokens_seen": 4647936, "step": 13815 }, { "epoch": 10.680061823802165, "grad_norm": 0.99172043800354, "learning_rate": 2.6398634985101582e-05, "loss": 0.4362, "num_input_tokens_seen": 4649504, "step": 13820 }, { "epoch": 10.683925811437403, "grad_norm": 1.7402554750442505, "learning_rate": 2.638180124706072e-05, "loss": 0.773, "num_input_tokens_seen": 4651008, "step": 13825 }, { "epoch": 10.687789799072643, "grad_norm": 1.319742202758789, "learning_rate": 2.6364966880568377e-05, "loss": 0.4743, "num_input_tokens_seen": 4652832, "step": 13830 }, { "epoch": 10.691653786707883, "grad_norm": 0.6542954444885254, "learning_rate": 2.6348131893280927e-05, "loss": 0.3515, "num_input_tokens_seen": 4654368, "step": 13835 }, { "epoch": 10.695517774343122, "grad_norm": 0.8978901505470276, "learning_rate": 2.6331296292855013e-05, "loss": 0.3494, "num_input_tokens_seen": 4655872, "step": 13840 }, { "epoch": 10.699381761978362, "grad_norm": 1.0307217836380005, "learning_rate": 2.6314460086947567e-05, "loss": 0.4321, "num_input_tokens_seen": 4657728, "step": 13845 }, { "epoch": 10.7032457496136, "grad_norm": 0.9102848768234253, "learning_rate": 2.6297623283215806e-05, "loss": 0.5227, "num_input_tokens_seen": 4659328, "step": 13850 }, { "epoch": 10.70710973724884, "grad_norm": 1.6304832696914673, "learning_rate": 2.628078588931721e-05, "loss": 0.4202, "num_input_tokens_seen": 4661152, "step": 13855 }, { "epoch": 10.71097372488408, "grad_norm": 1.0416829586029053, "learning_rate": 2.626394791290952e-05, "loss": 0.4674, "num_input_tokens_seen": 4662784, "step": 13860 }, { "epoch": 10.71483771251932, "grad_norm": 1.2876968383789062, "learning_rate": 2.6247109361650742e-05, "loss": 0.523, "num_input_tokens_seen": 4664512, "step": 13865 }, { "epoch": 10.71870170015456, "grad_norm": 1.022584319114685, "learning_rate": 2.623027024319916e-05, "loss": 0.5306, "num_input_tokens_seen": 4666240, "step": 13870 }, { "epoch": 10.7225656877898, "grad_norm": 0.6331316232681274, "learning_rate": 2.621343056521331e-05, "loss": 0.4648, "num_input_tokens_seen": 4667936, "step": 13875 }, { "epoch": 10.726429675425038, "grad_norm": 0.7416380643844604, "learning_rate": 2.619659033535196e-05, "loss": 0.3539, "num_input_tokens_seen": 4669536, "step": 13880 }, { "epoch": 10.730293663060278, "grad_norm": 0.9958297610282898, "learning_rate": 2.617974956127417e-05, "loss": 0.4607, "num_input_tokens_seen": 4671584, "step": 13885 }, { "epoch": 10.734157650695519, "grad_norm": 1.1042817831039429, "learning_rate": 2.6162908250639212e-05, "loss": 0.4966, "num_input_tokens_seen": 4673184, "step": 13890 }, { "epoch": 10.738021638330757, "grad_norm": 1.2651724815368652, "learning_rate": 2.6146066411106618e-05, "loss": 0.5259, "num_input_tokens_seen": 4674976, "step": 13895 }, { "epoch": 10.741885625965997, "grad_norm": 0.8841461539268494, "learning_rate": 2.6129224050336155e-05, "loss": 0.4597, "num_input_tokens_seen": 4676736, "step": 13900 }, { "epoch": 10.745749613601237, "grad_norm": 0.7271049618721008, "learning_rate": 2.6112381175987828e-05, "loss": 0.4331, "num_input_tokens_seen": 4678656, "step": 13905 }, { "epoch": 10.749613601236476, "grad_norm": 1.3614147901535034, "learning_rate": 2.6095537795721886e-05, "loss": 0.512, "num_input_tokens_seen": 4680352, "step": 13910 }, { "epoch": 10.753477588871716, "grad_norm": 0.8726553320884705, "learning_rate": 2.6078693917198798e-05, "loss": 0.4086, "num_input_tokens_seen": 4682144, "step": 13915 }, { "epoch": 10.757341576506954, "grad_norm": 1.0569381713867188, "learning_rate": 2.6061849548079247e-05, "loss": 0.5042, "num_input_tokens_seen": 4683808, "step": 13920 }, { "epoch": 10.761205564142195, "grad_norm": 0.7914344668388367, "learning_rate": 2.604500469602416e-05, "loss": 0.3738, "num_input_tokens_seen": 4685728, "step": 13925 }, { "epoch": 10.765069551777435, "grad_norm": 0.9364996552467346, "learning_rate": 2.602815936869469e-05, "loss": 0.4524, "num_input_tokens_seen": 4687392, "step": 13930 }, { "epoch": 10.768933539412673, "grad_norm": 0.7942550182342529, "learning_rate": 2.601131357375217e-05, "loss": 0.5249, "num_input_tokens_seen": 4689216, "step": 13935 }, { "epoch": 10.772797527047913, "grad_norm": 0.7515724897384644, "learning_rate": 2.599446731885819e-05, "loss": 0.4005, "num_input_tokens_seen": 4690912, "step": 13940 }, { "epoch": 10.776661514683154, "grad_norm": 1.4350272417068481, "learning_rate": 2.5977620611674514e-05, "loss": 0.5282, "num_input_tokens_seen": 4692544, "step": 13945 }, { "epoch": 10.780525502318392, "grad_norm": 0.8692649006843567, "learning_rate": 2.5960773459863132e-05, "loss": 0.3878, "num_input_tokens_seen": 4694144, "step": 13950 }, { "epoch": 10.784389489953632, "grad_norm": 0.5264361500740051, "learning_rate": 2.5943925871086216e-05, "loss": 0.4519, "num_input_tokens_seen": 4695808, "step": 13955 }, { "epoch": 10.788253477588873, "grad_norm": 1.718780755996704, "learning_rate": 2.5927077853006178e-05, "loss": 0.6745, "num_input_tokens_seen": 4697632, "step": 13960 }, { "epoch": 10.792117465224111, "grad_norm": 0.7258157730102539, "learning_rate": 2.5910229413285563e-05, "loss": 0.5341, "num_input_tokens_seen": 4699296, "step": 13965 }, { "epoch": 10.795981452859351, "grad_norm": 1.0778197050094604, "learning_rate": 2.5893380559587167e-05, "loss": 0.5002, "num_input_tokens_seen": 4700832, "step": 13970 }, { "epoch": 10.79984544049459, "grad_norm": 0.8417460322380066, "learning_rate": 2.5876531299573947e-05, "loss": 0.5108, "num_input_tokens_seen": 4702336, "step": 13975 }, { "epoch": 10.80370942812983, "grad_norm": 0.6332457065582275, "learning_rate": 2.585968164090904e-05, "loss": 0.4433, "num_input_tokens_seen": 4704000, "step": 13980 }, { "epoch": 10.80757341576507, "grad_norm": 1.1692358255386353, "learning_rate": 2.5842831591255768e-05, "loss": 0.6449, "num_input_tokens_seen": 4705824, "step": 13985 }, { "epoch": 10.811437403400308, "grad_norm": 1.750584602355957, "learning_rate": 2.5825981158277645e-05, "loss": 0.4991, "num_input_tokens_seen": 4707168, "step": 13990 }, { "epoch": 10.815301391035549, "grad_norm": 1.2078630924224854, "learning_rate": 2.580913034963835e-05, "loss": 0.441, "num_input_tokens_seen": 4708704, "step": 13995 }, { "epoch": 10.819165378670789, "grad_norm": 0.8126177787780762, "learning_rate": 2.5792279173001722e-05, "loss": 0.3839, "num_input_tokens_seen": 4710400, "step": 14000 }, { "epoch": 10.823029366306027, "grad_norm": 1.0444247722625732, "learning_rate": 2.5775427636031773e-05, "loss": 0.3511, "num_input_tokens_seen": 4712128, "step": 14005 }, { "epoch": 10.826893353941268, "grad_norm": 0.7350478768348694, "learning_rate": 2.57585757463927e-05, "loss": 0.3266, "num_input_tokens_seen": 4713728, "step": 14010 }, { "epoch": 10.830757341576508, "grad_norm": 0.7336398363113403, "learning_rate": 2.5741723511748837e-05, "loss": 0.4996, "num_input_tokens_seen": 4715232, "step": 14015 }, { "epoch": 10.834621329211746, "grad_norm": 0.5927093625068665, "learning_rate": 2.5724870939764674e-05, "loss": 0.349, "num_input_tokens_seen": 4716960, "step": 14020 }, { "epoch": 10.838485316846986, "grad_norm": 1.0753463506698608, "learning_rate": 2.5708018038104862e-05, "loss": 0.4085, "num_input_tokens_seen": 4718464, "step": 14025 }, { "epoch": 10.842349304482227, "grad_norm": 1.4154770374298096, "learning_rate": 2.5691164814434214e-05, "loss": 0.533, "num_input_tokens_seen": 4720256, "step": 14030 }, { "epoch": 10.846213292117465, "grad_norm": 2.0912022590637207, "learning_rate": 2.5674311276417672e-05, "loss": 0.6615, "num_input_tokens_seen": 4722112, "step": 14035 }, { "epoch": 10.850077279752705, "grad_norm": 1.6564749479293823, "learning_rate": 2.5657457431720315e-05, "loss": 0.4793, "num_input_tokens_seen": 4723680, "step": 14040 }, { "epoch": 10.853941267387944, "grad_norm": 0.8886836767196655, "learning_rate": 2.5640603288007385e-05, "loss": 0.3938, "num_input_tokens_seen": 4725120, "step": 14045 }, { "epoch": 10.857805255023184, "grad_norm": 0.9410741925239563, "learning_rate": 2.5623748852944246e-05, "loss": 0.5357, "num_input_tokens_seen": 4726976, "step": 14050 }, { "epoch": 10.861669242658424, "grad_norm": 0.9257506728172302, "learning_rate": 2.5606894134196386e-05, "loss": 0.4468, "num_input_tokens_seen": 4729024, "step": 14055 }, { "epoch": 10.865533230293662, "grad_norm": 1.4049268960952759, "learning_rate": 2.5590039139429444e-05, "loss": 0.5226, "num_input_tokens_seen": 4730656, "step": 14060 }, { "epoch": 10.869397217928903, "grad_norm": 0.619108259677887, "learning_rate": 2.5573183876309165e-05, "loss": 0.4221, "num_input_tokens_seen": 4732384, "step": 14065 }, { "epoch": 10.873261205564143, "grad_norm": 1.2156705856323242, "learning_rate": 2.555632835250143e-05, "loss": 0.6153, "num_input_tokens_seen": 4733952, "step": 14070 }, { "epoch": 10.877125193199381, "grad_norm": 0.8942641615867615, "learning_rate": 2.5539472575672226e-05, "loss": 0.3963, "num_input_tokens_seen": 4735840, "step": 14075 }, { "epoch": 10.880989180834622, "grad_norm": 1.3690348863601685, "learning_rate": 2.5522616553487664e-05, "loss": 0.5937, "num_input_tokens_seen": 4737440, "step": 14080 }, { "epoch": 10.884853168469862, "grad_norm": 0.9199308156967163, "learning_rate": 2.5505760293613962e-05, "loss": 0.324, "num_input_tokens_seen": 4739136, "step": 14085 }, { "epoch": 10.8887171561051, "grad_norm": 1.1613599061965942, "learning_rate": 2.548890380371745e-05, "loss": 0.4189, "num_input_tokens_seen": 4741056, "step": 14090 }, { "epoch": 10.89258114374034, "grad_norm": 0.8919600248336792, "learning_rate": 2.5472047091464564e-05, "loss": 0.4203, "num_input_tokens_seen": 4742720, "step": 14095 }, { "epoch": 10.896445131375579, "grad_norm": 1.525112271308899, "learning_rate": 2.5455190164521838e-05, "loss": 0.3602, "num_input_tokens_seen": 4744352, "step": 14100 }, { "epoch": 10.900309119010819, "grad_norm": 0.7532141208648682, "learning_rate": 2.5438333030555887e-05, "loss": 0.3598, "num_input_tokens_seen": 4746176, "step": 14105 }, { "epoch": 10.90417310664606, "grad_norm": 1.3379075527191162, "learning_rate": 2.5421475697233455e-05, "loss": 0.4179, "num_input_tokens_seen": 4747680, "step": 14110 }, { "epoch": 10.908037094281298, "grad_norm": 0.8087301254272461, "learning_rate": 2.540461817222135e-05, "loss": 0.4367, "num_input_tokens_seen": 4749408, "step": 14115 }, { "epoch": 10.911901081916538, "grad_norm": 0.7250721454620361, "learning_rate": 2.5387760463186484e-05, "loss": 0.4899, "num_input_tokens_seen": 4751296, "step": 14120 }, { "epoch": 10.915765069551778, "grad_norm": 0.9003886580467224, "learning_rate": 2.5370902577795817e-05, "loss": 0.5399, "num_input_tokens_seen": 4752928, "step": 14125 }, { "epoch": 10.919629057187016, "grad_norm": 0.6646820902824402, "learning_rate": 2.5354044523716458e-05, "loss": 0.3049, "num_input_tokens_seen": 4754368, "step": 14130 }, { "epoch": 10.923493044822257, "grad_norm": 0.9507137537002563, "learning_rate": 2.5337186308615523e-05, "loss": 0.4139, "num_input_tokens_seen": 4756320, "step": 14135 }, { "epoch": 10.927357032457497, "grad_norm": 1.4788686037063599, "learning_rate": 2.532032794016023e-05, "loss": 0.5208, "num_input_tokens_seen": 4758304, "step": 14140 }, { "epoch": 10.931221020092735, "grad_norm": 1.0725806951522827, "learning_rate": 2.5303469426017878e-05, "loss": 0.4169, "num_input_tokens_seen": 4759936, "step": 14145 }, { "epoch": 10.935085007727976, "grad_norm": 1.036256194114685, "learning_rate": 2.5286610773855813e-05, "loss": 0.4754, "num_input_tokens_seen": 4761568, "step": 14150 }, { "epoch": 10.938948995363216, "grad_norm": 0.8593799471855164, "learning_rate": 2.5269751991341455e-05, "loss": 0.4728, "num_input_tokens_seen": 4763424, "step": 14155 }, { "epoch": 10.942812982998454, "grad_norm": 1.2340847253799438, "learning_rate": 2.5252893086142266e-05, "loss": 0.4653, "num_input_tokens_seen": 4765280, "step": 14160 }, { "epoch": 10.946676970633694, "grad_norm": 1.4053109884262085, "learning_rate": 2.523603406592579e-05, "loss": 0.4296, "num_input_tokens_seen": 4767200, "step": 14165 }, { "epoch": 10.950540958268933, "grad_norm": 1.282455563545227, "learning_rate": 2.5219174938359612e-05, "loss": 0.7193, "num_input_tokens_seen": 4769152, "step": 14170 }, { "epoch": 10.954404945904173, "grad_norm": 1.1505323648452759, "learning_rate": 2.5202315711111358e-05, "loss": 0.4073, "num_input_tokens_seen": 4770656, "step": 14175 }, { "epoch": 10.958268933539413, "grad_norm": 0.6761283874511719, "learning_rate": 2.5185456391848705e-05, "loss": 0.4142, "num_input_tokens_seen": 4772064, "step": 14180 }, { "epoch": 10.962132921174652, "grad_norm": 0.7410753965377808, "learning_rate": 2.5168596988239374e-05, "loss": 0.4055, "num_input_tokens_seen": 4773856, "step": 14185 }, { "epoch": 10.965996908809892, "grad_norm": 0.8043281435966492, "learning_rate": 2.5151737507951123e-05, "loss": 0.5262, "num_input_tokens_seen": 4775424, "step": 14190 }, { "epoch": 10.969860896445132, "grad_norm": 1.3315805196762085, "learning_rate": 2.5134877958651747e-05, "loss": 0.4967, "num_input_tokens_seen": 4777056, "step": 14195 }, { "epoch": 10.97372488408037, "grad_norm": 0.8204246759414673, "learning_rate": 2.511801834800907e-05, "loss": 0.4115, "num_input_tokens_seen": 4778656, "step": 14200 }, { "epoch": 10.97758887171561, "grad_norm": 1.2345786094665527, "learning_rate": 2.5101158683690935e-05, "loss": 0.8207, "num_input_tokens_seen": 4780672, "step": 14205 }, { "epoch": 10.98145285935085, "grad_norm": 0.9122880697250366, "learning_rate": 2.5084298973365222e-05, "loss": 0.3742, "num_input_tokens_seen": 4782400, "step": 14210 }, { "epoch": 10.98531684698609, "grad_norm": 1.6156796216964722, "learning_rate": 2.506743922469984e-05, "loss": 0.5903, "num_input_tokens_seen": 4784192, "step": 14215 }, { "epoch": 10.98918083462133, "grad_norm": 1.0297225713729858, "learning_rate": 2.5050579445362693e-05, "loss": 0.6345, "num_input_tokens_seen": 4786080, "step": 14220 }, { "epoch": 10.993044822256568, "grad_norm": 0.7487525343894958, "learning_rate": 2.5033719643021707e-05, "loss": 0.3297, "num_input_tokens_seen": 4787872, "step": 14225 }, { "epoch": 10.996908809891808, "grad_norm": 0.9403852224349976, "learning_rate": 2.501685982534483e-05, "loss": 0.62, "num_input_tokens_seen": 4789376, "step": 14230 }, { "epoch": 11.0, "eval_loss": 0.4719402492046356, "eval_runtime": 6.3655, "eval_samples_per_second": 90.331, "eval_steps_per_second": 22.622, "num_input_tokens_seen": 4790336, "step": 14234 }, { "epoch": 11.000772797527048, "grad_norm": 1.4464969635009766, "learning_rate": 2.5e-05, "loss": 0.5641, "num_input_tokens_seen": 4790624, "step": 14235 }, { "epoch": 11.004636785162287, "grad_norm": 0.8142437934875488, "learning_rate": 2.498314017465518e-05, "loss": 0.5029, "num_input_tokens_seen": 4792448, "step": 14240 }, { "epoch": 11.008500772797527, "grad_norm": 0.7527857422828674, "learning_rate": 2.4966280356978296e-05, "loss": 0.4869, "num_input_tokens_seen": 4794048, "step": 14245 }, { "epoch": 11.012364760432767, "grad_norm": 1.4220138788223267, "learning_rate": 2.4949420554637316e-05, "loss": 0.4536, "num_input_tokens_seen": 4795872, "step": 14250 }, { "epoch": 11.016228748068006, "grad_norm": 1.028754472732544, "learning_rate": 2.493256077530017e-05, "loss": 0.3858, "num_input_tokens_seen": 4797536, "step": 14255 }, { "epoch": 11.020092735703246, "grad_norm": 0.745125412940979, "learning_rate": 2.4915701026634777e-05, "loss": 0.4161, "num_input_tokens_seen": 4799616, "step": 14260 }, { "epoch": 11.023956723338486, "grad_norm": 1.286819577217102, "learning_rate": 2.4898841316309067e-05, "loss": 0.3902, "num_input_tokens_seen": 4801216, "step": 14265 }, { "epoch": 11.027820710973725, "grad_norm": 1.452097773551941, "learning_rate": 2.4881981651990937e-05, "loss": 0.6466, "num_input_tokens_seen": 4802816, "step": 14270 }, { "epoch": 11.031684698608965, "grad_norm": 0.9256743788719177, "learning_rate": 2.486512204134826e-05, "loss": 0.4706, "num_input_tokens_seen": 4804416, "step": 14275 }, { "epoch": 11.035548686244203, "grad_norm": 1.4206055402755737, "learning_rate": 2.484826249204888e-05, "loss": 0.5933, "num_input_tokens_seen": 4805856, "step": 14280 }, { "epoch": 11.039412673879443, "grad_norm": 0.7403125166893005, "learning_rate": 2.4831403011760635e-05, "loss": 0.4145, "num_input_tokens_seen": 4807520, "step": 14285 }, { "epoch": 11.043276661514684, "grad_norm": 0.8413897752761841, "learning_rate": 2.4814543608151305e-05, "loss": 0.5725, "num_input_tokens_seen": 4809376, "step": 14290 }, { "epoch": 11.047140649149922, "grad_norm": 1.5181328058242798, "learning_rate": 2.479768428888865e-05, "loss": 0.4302, "num_input_tokens_seen": 4811008, "step": 14295 }, { "epoch": 11.051004636785162, "grad_norm": 0.7026469707489014, "learning_rate": 2.4780825061640387e-05, "loss": 0.4393, "num_input_tokens_seen": 4812480, "step": 14300 }, { "epoch": 11.054868624420402, "grad_norm": 1.123587965965271, "learning_rate": 2.476396593407421e-05, "loss": 0.36, "num_input_tokens_seen": 4813888, "step": 14305 }, { "epoch": 11.05873261205564, "grad_norm": 1.0688706636428833, "learning_rate": 2.4747106913857737e-05, "loss": 0.5677, "num_input_tokens_seen": 4815680, "step": 14310 }, { "epoch": 11.062596599690881, "grad_norm": 0.7342590689659119, "learning_rate": 2.473024800865855e-05, "loss": 0.3214, "num_input_tokens_seen": 4817248, "step": 14315 }, { "epoch": 11.066460587326121, "grad_norm": 0.8939720392227173, "learning_rate": 2.4713389226144193e-05, "loss": 0.4696, "num_input_tokens_seen": 4819200, "step": 14320 }, { "epoch": 11.07032457496136, "grad_norm": 0.6455073356628418, "learning_rate": 2.469653057398213e-05, "loss": 0.4469, "num_input_tokens_seen": 4820928, "step": 14325 }, { "epoch": 11.0741885625966, "grad_norm": 1.5656157732009888, "learning_rate": 2.4679672059839774e-05, "loss": 0.5836, "num_input_tokens_seen": 4822752, "step": 14330 }, { "epoch": 11.07805255023184, "grad_norm": 0.7491979002952576, "learning_rate": 2.4662813691384486e-05, "loss": 0.3981, "num_input_tokens_seen": 4824544, "step": 14335 }, { "epoch": 11.081916537867079, "grad_norm": 1.0844913721084595, "learning_rate": 2.464595547628354e-05, "loss": 0.6792, "num_input_tokens_seen": 4826752, "step": 14340 }, { "epoch": 11.085780525502319, "grad_norm": 0.7180913090705872, "learning_rate": 2.462909742220418e-05, "loss": 0.3229, "num_input_tokens_seen": 4828576, "step": 14345 }, { "epoch": 11.089644513137557, "grad_norm": 0.6744866967201233, "learning_rate": 2.461223953681352e-05, "loss": 0.3496, "num_input_tokens_seen": 4830208, "step": 14350 }, { "epoch": 11.093508500772797, "grad_norm": 1.1609389781951904, "learning_rate": 2.4595381827778655e-05, "loss": 0.394, "num_input_tokens_seen": 4831936, "step": 14355 }, { "epoch": 11.097372488408038, "grad_norm": 1.57647705078125, "learning_rate": 2.4578524302766554e-05, "loss": 0.3639, "num_input_tokens_seen": 4833472, "step": 14360 }, { "epoch": 11.101236476043276, "grad_norm": 1.1344690322875977, "learning_rate": 2.456166696944412e-05, "loss": 0.4352, "num_input_tokens_seen": 4835104, "step": 14365 }, { "epoch": 11.105100463678516, "grad_norm": 0.6424022316932678, "learning_rate": 2.4544809835478175e-05, "loss": 0.4102, "num_input_tokens_seen": 4836864, "step": 14370 }, { "epoch": 11.108964451313756, "grad_norm": 0.9967986941337585, "learning_rate": 2.4527952908535445e-05, "loss": 0.3959, "num_input_tokens_seen": 4838368, "step": 14375 }, { "epoch": 11.112828438948995, "grad_norm": 0.7400710582733154, "learning_rate": 2.4511096196282547e-05, "loss": 0.3731, "num_input_tokens_seen": 4840064, "step": 14380 }, { "epoch": 11.116692426584235, "grad_norm": 0.9929935932159424, "learning_rate": 2.4494239706386037e-05, "loss": 0.4876, "num_input_tokens_seen": 4841760, "step": 14385 }, { "epoch": 11.120556414219475, "grad_norm": 1.2112082242965698, "learning_rate": 2.4477383446512338e-05, "loss": 0.4449, "num_input_tokens_seen": 4843296, "step": 14390 }, { "epoch": 11.124420401854714, "grad_norm": 1.3038556575775146, "learning_rate": 2.446052742432778e-05, "loss": 0.3476, "num_input_tokens_seen": 4844928, "step": 14395 }, { "epoch": 11.128284389489954, "grad_norm": 1.0617964267730713, "learning_rate": 2.4443671647498577e-05, "loss": 0.4423, "num_input_tokens_seen": 4846400, "step": 14400 }, { "epoch": 11.132148377125192, "grad_norm": 0.9343466758728027, "learning_rate": 2.442681612369084e-05, "loss": 0.5481, "num_input_tokens_seen": 4848000, "step": 14405 }, { "epoch": 11.136012364760433, "grad_norm": 1.3603110313415527, "learning_rate": 2.4409960860570566e-05, "loss": 0.5677, "num_input_tokens_seen": 4849568, "step": 14410 }, { "epoch": 11.139876352395673, "grad_norm": 0.9731127619743347, "learning_rate": 2.439310586580362e-05, "loss": 0.4424, "num_input_tokens_seen": 4851232, "step": 14415 }, { "epoch": 11.143740340030911, "grad_norm": 1.3826621770858765, "learning_rate": 2.4376251147055757e-05, "loss": 0.5513, "num_input_tokens_seen": 4852864, "step": 14420 }, { "epoch": 11.147604327666151, "grad_norm": 0.8459819555282593, "learning_rate": 2.4359396711992617e-05, "loss": 0.479, "num_input_tokens_seen": 4854848, "step": 14425 }, { "epoch": 11.151468315301392, "grad_norm": 0.8843247890472412, "learning_rate": 2.4342542568279687e-05, "loss": 0.3882, "num_input_tokens_seen": 4856608, "step": 14430 }, { "epoch": 11.15533230293663, "grad_norm": 1.3857221603393555, "learning_rate": 2.432568872358233e-05, "loss": 0.5078, "num_input_tokens_seen": 4858144, "step": 14435 }, { "epoch": 11.15919629057187, "grad_norm": 1.4743521213531494, "learning_rate": 2.430883518556579e-05, "loss": 0.5694, "num_input_tokens_seen": 4860000, "step": 14440 }, { "epoch": 11.16306027820711, "grad_norm": 1.2740576267242432, "learning_rate": 2.429198196189514e-05, "loss": 0.4019, "num_input_tokens_seen": 4861632, "step": 14445 }, { "epoch": 11.166924265842349, "grad_norm": 0.7476908564567566, "learning_rate": 2.4275129060235332e-05, "loss": 0.4947, "num_input_tokens_seen": 4863136, "step": 14450 }, { "epoch": 11.170788253477589, "grad_norm": 1.2664434909820557, "learning_rate": 2.4258276488251172e-05, "loss": 0.4651, "num_input_tokens_seen": 4864576, "step": 14455 }, { "epoch": 11.17465224111283, "grad_norm": 0.7139345407485962, "learning_rate": 2.42414242536073e-05, "loss": 0.42, "num_input_tokens_seen": 4866336, "step": 14460 }, { "epoch": 11.178516228748068, "grad_norm": 1.512848138809204, "learning_rate": 2.422457236396823e-05, "loss": 0.4927, "num_input_tokens_seen": 4868192, "step": 14465 }, { "epoch": 11.182380216383308, "grad_norm": 1.5567530393600464, "learning_rate": 2.4207720826998284e-05, "loss": 0.5857, "num_input_tokens_seen": 4869856, "step": 14470 }, { "epoch": 11.186244204018546, "grad_norm": 0.7637233138084412, "learning_rate": 2.419086965036166e-05, "loss": 0.4296, "num_input_tokens_seen": 4871488, "step": 14475 }, { "epoch": 11.190108191653787, "grad_norm": 1.3058112859725952, "learning_rate": 2.417401884172236e-05, "loss": 0.3473, "num_input_tokens_seen": 4873248, "step": 14480 }, { "epoch": 11.193972179289027, "grad_norm": 0.738941490650177, "learning_rate": 2.4157168408744235e-05, "loss": 0.5337, "num_input_tokens_seen": 4875008, "step": 14485 }, { "epoch": 11.197836166924265, "grad_norm": 1.0089188814163208, "learning_rate": 2.414031835909097e-05, "loss": 0.4751, "num_input_tokens_seen": 4876448, "step": 14490 }, { "epoch": 11.201700154559505, "grad_norm": 0.6770334839820862, "learning_rate": 2.4123468700426065e-05, "loss": 0.3528, "num_input_tokens_seen": 4878144, "step": 14495 }, { "epoch": 11.205564142194746, "grad_norm": 1.333146095275879, "learning_rate": 2.4106619440412835e-05, "loss": 0.3845, "num_input_tokens_seen": 4879872, "step": 14500 }, { "epoch": 11.209428129829984, "grad_norm": 1.2473833560943604, "learning_rate": 2.4089770586714436e-05, "loss": 0.5253, "num_input_tokens_seen": 4881504, "step": 14505 }, { "epoch": 11.213292117465224, "grad_norm": 1.2544723749160767, "learning_rate": 2.407292214699383e-05, "loss": 0.4433, "num_input_tokens_seen": 4883104, "step": 14510 }, { "epoch": 11.217156105100464, "grad_norm": 0.7949316501617432, "learning_rate": 2.4056074128913787e-05, "loss": 0.3503, "num_input_tokens_seen": 4884704, "step": 14515 }, { "epoch": 11.221020092735703, "grad_norm": 0.6993240118026733, "learning_rate": 2.4039226540136874e-05, "loss": 0.5632, "num_input_tokens_seen": 4886400, "step": 14520 }, { "epoch": 11.224884080370943, "grad_norm": 0.7649931311607361, "learning_rate": 2.4022379388325495e-05, "loss": 0.4722, "num_input_tokens_seen": 4888128, "step": 14525 }, { "epoch": 11.228748068006182, "grad_norm": 0.9776847958564758, "learning_rate": 2.4005532681141822e-05, "loss": 0.4512, "num_input_tokens_seen": 4889696, "step": 14530 }, { "epoch": 11.232612055641422, "grad_norm": 1.1407370567321777, "learning_rate": 2.3988686426247834e-05, "loss": 0.5276, "num_input_tokens_seen": 4891488, "step": 14535 }, { "epoch": 11.236476043276662, "grad_norm": 1.8374096155166626, "learning_rate": 2.3971840631305317e-05, "loss": 0.4033, "num_input_tokens_seen": 4893024, "step": 14540 }, { "epoch": 11.2403400309119, "grad_norm": 1.3453986644744873, "learning_rate": 2.395499530397584e-05, "loss": 0.5485, "num_input_tokens_seen": 4894688, "step": 14545 }, { "epoch": 11.24420401854714, "grad_norm": 0.9869358539581299, "learning_rate": 2.393815045192076e-05, "loss": 0.4102, "num_input_tokens_seen": 4896512, "step": 14550 }, { "epoch": 11.24806800618238, "grad_norm": 0.8282604813575745, "learning_rate": 2.3921306082801208e-05, "loss": 0.5331, "num_input_tokens_seen": 4898144, "step": 14555 }, { "epoch": 11.25193199381762, "grad_norm": 0.9575820565223694, "learning_rate": 2.3904462204278117e-05, "loss": 0.3944, "num_input_tokens_seen": 4899872, "step": 14560 }, { "epoch": 11.25579598145286, "grad_norm": 1.8896671533584595, "learning_rate": 2.3887618824012175e-05, "loss": 0.6429, "num_input_tokens_seen": 4901472, "step": 14565 }, { "epoch": 11.2596599690881, "grad_norm": 0.7567316293716431, "learning_rate": 2.387077594966385e-05, "loss": 0.3632, "num_input_tokens_seen": 4903232, "step": 14570 }, { "epoch": 11.263523956723338, "grad_norm": 0.8838436007499695, "learning_rate": 2.385393358889339e-05, "loss": 0.4934, "num_input_tokens_seen": 4904608, "step": 14575 }, { "epoch": 11.267387944358578, "grad_norm": 0.8292526006698608, "learning_rate": 2.3837091749360787e-05, "loss": 0.4327, "num_input_tokens_seen": 4906432, "step": 14580 }, { "epoch": 11.271251931993818, "grad_norm": 0.8270739912986755, "learning_rate": 2.3820250438725834e-05, "loss": 0.5923, "num_input_tokens_seen": 4907968, "step": 14585 }, { "epoch": 11.275115919629057, "grad_norm": 1.5372583866119385, "learning_rate": 2.3803409664648042e-05, "loss": 0.3885, "num_input_tokens_seen": 4909728, "step": 14590 }, { "epoch": 11.278979907264297, "grad_norm": 0.9412124156951904, "learning_rate": 2.3786569434786696e-05, "loss": 0.5295, "num_input_tokens_seen": 4911168, "step": 14595 }, { "epoch": 11.282843894899536, "grad_norm": 1.6219040155410767, "learning_rate": 2.3769729756800845e-05, "loss": 0.3797, "num_input_tokens_seen": 4912928, "step": 14600 }, { "epoch": 11.286707882534776, "grad_norm": 1.5818686485290527, "learning_rate": 2.375289063834926e-05, "loss": 0.5001, "num_input_tokens_seen": 4914560, "step": 14605 }, { "epoch": 11.290571870170016, "grad_norm": 2.0374481678009033, "learning_rate": 2.3736052087090494e-05, "loss": 0.8222, "num_input_tokens_seen": 4916512, "step": 14610 }, { "epoch": 11.294435857805254, "grad_norm": 1.123658537864685, "learning_rate": 2.37192141106828e-05, "loss": 0.5597, "num_input_tokens_seen": 4918176, "step": 14615 }, { "epoch": 11.298299845440495, "grad_norm": 0.7640050649642944, "learning_rate": 2.3702376716784196e-05, "loss": 0.3639, "num_input_tokens_seen": 4919872, "step": 14620 }, { "epoch": 11.302163833075735, "grad_norm": 0.9151740074157715, "learning_rate": 2.368553991305244e-05, "loss": 0.3826, "num_input_tokens_seen": 4921632, "step": 14625 }, { "epoch": 11.306027820710973, "grad_norm": 0.7223367094993591, "learning_rate": 2.3668703707144993e-05, "loss": 0.595, "num_input_tokens_seen": 4923136, "step": 14630 }, { "epoch": 11.309891808346213, "grad_norm": 0.6294185519218445, "learning_rate": 2.3651868106719082e-05, "loss": 0.4757, "num_input_tokens_seen": 4924800, "step": 14635 }, { "epoch": 11.313755795981454, "grad_norm": 0.8023383021354675, "learning_rate": 2.363503311943163e-05, "loss": 0.4542, "num_input_tokens_seen": 4926592, "step": 14640 }, { "epoch": 11.317619783616692, "grad_norm": 1.7900207042694092, "learning_rate": 2.3618198752939284e-05, "loss": 0.4337, "num_input_tokens_seen": 4928640, "step": 14645 }, { "epoch": 11.321483771251932, "grad_norm": 1.0630656480789185, "learning_rate": 2.3601365014898427e-05, "loss": 0.3519, "num_input_tokens_seen": 4930368, "step": 14650 }, { "epoch": 11.32534775888717, "grad_norm": 0.907017707824707, "learning_rate": 2.358453191296513e-05, "loss": 0.4593, "num_input_tokens_seen": 4932160, "step": 14655 }, { "epoch": 11.329211746522411, "grad_norm": 1.0710612535476685, "learning_rate": 2.3567699454795197e-05, "loss": 0.4332, "num_input_tokens_seen": 4934112, "step": 14660 }, { "epoch": 11.333075734157651, "grad_norm": 0.8546087741851807, "learning_rate": 2.3550867648044127e-05, "loss": 0.5084, "num_input_tokens_seen": 4935904, "step": 14665 }, { "epoch": 11.33693972179289, "grad_norm": 0.7404098510742188, "learning_rate": 2.353403650036714e-05, "loss": 0.301, "num_input_tokens_seen": 4937824, "step": 14670 }, { "epoch": 11.34080370942813, "grad_norm": 0.756598711013794, "learning_rate": 2.351720601941913e-05, "loss": 0.4482, "num_input_tokens_seen": 4939712, "step": 14675 }, { "epoch": 11.34466769706337, "grad_norm": 1.4298646450042725, "learning_rate": 2.3500376212854694e-05, "loss": 0.6664, "num_input_tokens_seen": 4941632, "step": 14680 }, { "epoch": 11.348531684698608, "grad_norm": 1.1414834260940552, "learning_rate": 2.3483547088328143e-05, "loss": 0.4545, "num_input_tokens_seen": 4943520, "step": 14685 }, { "epoch": 11.352395672333849, "grad_norm": 1.1642295122146606, "learning_rate": 2.3466718653493464e-05, "loss": 0.4754, "num_input_tokens_seen": 4945472, "step": 14690 }, { "epoch": 11.356259659969089, "grad_norm": 0.6378620862960815, "learning_rate": 2.3449890916004312e-05, "loss": 0.3535, "num_input_tokens_seen": 4947072, "step": 14695 }, { "epoch": 11.360123647604327, "grad_norm": 1.1927909851074219, "learning_rate": 2.343306388351405e-05, "loss": 0.4174, "num_input_tokens_seen": 4948640, "step": 14700 }, { "epoch": 11.363987635239567, "grad_norm": 0.6777770519256592, "learning_rate": 2.341623756367573e-05, "loss": 0.5861, "num_input_tokens_seen": 4950336, "step": 14705 }, { "epoch": 11.367851622874808, "grad_norm": 0.9547669291496277, "learning_rate": 2.3399411964142054e-05, "loss": 0.4436, "num_input_tokens_seen": 4951808, "step": 14710 }, { "epoch": 11.371715610510046, "grad_norm": 0.7562694549560547, "learning_rate": 2.3382587092565393e-05, "loss": 0.4622, "num_input_tokens_seen": 4953472, "step": 14715 }, { "epoch": 11.375579598145286, "grad_norm": 1.0017151832580566, "learning_rate": 2.3365762956597813e-05, "loss": 0.4603, "num_input_tokens_seen": 4955232, "step": 14720 }, { "epoch": 11.379443585780525, "grad_norm": 1.2523707151412964, "learning_rate": 2.3348939563891032e-05, "loss": 0.4292, "num_input_tokens_seen": 4956928, "step": 14725 }, { "epoch": 11.383307573415765, "grad_norm": 0.9085646271705627, "learning_rate": 2.3332116922096414e-05, "loss": 0.3802, "num_input_tokens_seen": 4958304, "step": 14730 }, { "epoch": 11.387171561051005, "grad_norm": 1.312192678451538, "learning_rate": 2.331529503886502e-05, "loss": 0.4932, "num_input_tokens_seen": 4960096, "step": 14735 }, { "epoch": 11.391035548686244, "grad_norm": 1.00066077709198, "learning_rate": 2.3298473921847512e-05, "loss": 0.4747, "num_input_tokens_seen": 4961632, "step": 14740 }, { "epoch": 11.394899536321484, "grad_norm": 0.871157169342041, "learning_rate": 2.3281653578694274e-05, "loss": 0.4052, "num_input_tokens_seen": 4963392, "step": 14745 }, { "epoch": 11.398763523956724, "grad_norm": 1.5719122886657715, "learning_rate": 2.326483401705527e-05, "loss": 0.5003, "num_input_tokens_seen": 4964768, "step": 14750 }, { "epoch": 11.402627511591962, "grad_norm": 1.624190330505371, "learning_rate": 2.3248015244580153e-05, "loss": 0.4435, "num_input_tokens_seen": 4966496, "step": 14755 }, { "epoch": 11.406491499227203, "grad_norm": 0.982480525970459, "learning_rate": 2.3231197268918192e-05, "loss": 0.4216, "num_input_tokens_seen": 4968032, "step": 14760 }, { "epoch": 11.410355486862443, "grad_norm": 0.9022105932235718, "learning_rate": 2.3214380097718306e-05, "loss": 0.4566, "num_input_tokens_seen": 4969792, "step": 14765 }, { "epoch": 11.414219474497681, "grad_norm": 1.1281970739364624, "learning_rate": 2.3197563738629046e-05, "loss": 0.4923, "num_input_tokens_seen": 4971424, "step": 14770 }, { "epoch": 11.418083462132921, "grad_norm": 1.0901153087615967, "learning_rate": 2.3180748199298593e-05, "loss": 0.4743, "num_input_tokens_seen": 4973184, "step": 14775 }, { "epoch": 11.42194744976816, "grad_norm": 0.7802616953849792, "learning_rate": 2.3163933487374745e-05, "loss": 0.4884, "num_input_tokens_seen": 4974720, "step": 14780 }, { "epoch": 11.4258114374034, "grad_norm": 0.8547631502151489, "learning_rate": 2.3147119610504946e-05, "loss": 0.3996, "num_input_tokens_seen": 4976256, "step": 14785 }, { "epoch": 11.42967542503864, "grad_norm": 1.5064575672149658, "learning_rate": 2.313030657633625e-05, "loss": 0.3861, "num_input_tokens_seen": 4977792, "step": 14790 }, { "epoch": 11.433539412673879, "grad_norm": 1.0838415622711182, "learning_rate": 2.3113494392515324e-05, "loss": 0.5296, "num_input_tokens_seen": 4979424, "step": 14795 }, { "epoch": 11.437403400309119, "grad_norm": 1.2231703996658325, "learning_rate": 2.3096683066688438e-05, "loss": 0.4731, "num_input_tokens_seen": 4981280, "step": 14800 }, { "epoch": 11.44126738794436, "grad_norm": 1.1022776365280151, "learning_rate": 2.3079872606501495e-05, "loss": 0.4305, "num_input_tokens_seen": 4982848, "step": 14805 }, { "epoch": 11.445131375579598, "grad_norm": 1.0975148677825928, "learning_rate": 2.306306301959999e-05, "loss": 0.396, "num_input_tokens_seen": 4984576, "step": 14810 }, { "epoch": 11.448995363214838, "grad_norm": 0.8243637084960938, "learning_rate": 2.3046254313629023e-05, "loss": 0.6881, "num_input_tokens_seen": 4986208, "step": 14815 }, { "epoch": 11.452859350850078, "grad_norm": 1.2143579721450806, "learning_rate": 2.3029446496233286e-05, "loss": 0.4448, "num_input_tokens_seen": 4987616, "step": 14820 }, { "epoch": 11.456723338485316, "grad_norm": 0.8663111329078674, "learning_rate": 2.3012639575057092e-05, "loss": 0.4711, "num_input_tokens_seen": 4989312, "step": 14825 }, { "epoch": 11.460587326120557, "grad_norm": 1.7567787170410156, "learning_rate": 2.2995833557744326e-05, "loss": 0.6616, "num_input_tokens_seen": 4991072, "step": 14830 }, { "epoch": 11.464451313755795, "grad_norm": 0.8139151930809021, "learning_rate": 2.2979028451938452e-05, "loss": 0.3788, "num_input_tokens_seen": 4992608, "step": 14835 }, { "epoch": 11.468315301391035, "grad_norm": 0.9804201126098633, "learning_rate": 2.296222426528255e-05, "loss": 0.506, "num_input_tokens_seen": 4994240, "step": 14840 }, { "epoch": 11.472179289026275, "grad_norm": 1.1593265533447266, "learning_rate": 2.2945421005419252e-05, "loss": 0.4147, "num_input_tokens_seen": 4995840, "step": 14845 }, { "epoch": 11.476043276661514, "grad_norm": 1.009444236755371, "learning_rate": 2.292861867999078e-05, "loss": 0.6474, "num_input_tokens_seen": 4997472, "step": 14850 }, { "epoch": 11.479907264296754, "grad_norm": 1.3149937391281128, "learning_rate": 2.2911817296638947e-05, "loss": 0.4066, "num_input_tokens_seen": 4999072, "step": 14855 }, { "epoch": 11.483771251931994, "grad_norm": 1.4468973875045776, "learning_rate": 2.28950168630051e-05, "loss": 0.7909, "num_input_tokens_seen": 5000672, "step": 14860 }, { "epoch": 11.487635239567233, "grad_norm": 1.2655245065689087, "learning_rate": 2.2878217386730196e-05, "loss": 0.3776, "num_input_tokens_seen": 5002336, "step": 14865 }, { "epoch": 11.491499227202473, "grad_norm": 0.7541322708129883, "learning_rate": 2.286141887545473e-05, "loss": 0.3525, "num_input_tokens_seen": 5004064, "step": 14870 }, { "epoch": 11.495363214837713, "grad_norm": 0.6539074778556824, "learning_rate": 2.2844621336818774e-05, "loss": 0.39, "num_input_tokens_seen": 5005856, "step": 14875 }, { "epoch": 11.499227202472952, "grad_norm": 1.4155529737472534, "learning_rate": 2.282782477846194e-05, "loss": 0.5162, "num_input_tokens_seen": 5007744, "step": 14880 }, { "epoch": 11.503091190108192, "grad_norm": 0.7933775186538696, "learning_rate": 2.2811029208023403e-05, "loss": 0.4438, "num_input_tokens_seen": 5009312, "step": 14885 }, { "epoch": 11.506955177743432, "grad_norm": 1.0333582162857056, "learning_rate": 2.279423463314189e-05, "loss": 0.5153, "num_input_tokens_seen": 5011008, "step": 14890 }, { "epoch": 11.51081916537867, "grad_norm": 0.7311718463897705, "learning_rate": 2.277744106145568e-05, "loss": 0.443, "num_input_tokens_seen": 5012608, "step": 14895 }, { "epoch": 11.51468315301391, "grad_norm": 0.9832296967506409, "learning_rate": 2.276064850060258e-05, "loss": 0.3921, "num_input_tokens_seen": 5014432, "step": 14900 }, { "epoch": 11.51854714064915, "grad_norm": 0.6489081382751465, "learning_rate": 2.274385695821995e-05, "loss": 0.4192, "num_input_tokens_seen": 5015968, "step": 14905 }, { "epoch": 11.52241112828439, "grad_norm": 0.7496013641357422, "learning_rate": 2.2727066441944693e-05, "loss": 0.4293, "num_input_tokens_seen": 5017632, "step": 14910 }, { "epoch": 11.52627511591963, "grad_norm": 0.8273250460624695, "learning_rate": 2.2710276959413236e-05, "loss": 0.3986, "num_input_tokens_seen": 5019232, "step": 14915 }, { "epoch": 11.530139103554868, "grad_norm": 0.999221920967102, "learning_rate": 2.269348851826152e-05, "loss": 0.5511, "num_input_tokens_seen": 5021216, "step": 14920 }, { "epoch": 11.534003091190108, "grad_norm": 1.004419207572937, "learning_rate": 2.2676701126125044e-05, "loss": 0.4063, "num_input_tokens_seen": 5022816, "step": 14925 }, { "epoch": 11.537867078825348, "grad_norm": 0.9339966773986816, "learning_rate": 2.2659914790638813e-05, "loss": 0.3089, "num_input_tokens_seen": 5024320, "step": 14930 }, { "epoch": 11.541731066460587, "grad_norm": 0.8921641111373901, "learning_rate": 2.2643129519437344e-05, "loss": 0.4731, "num_input_tokens_seen": 5025760, "step": 14935 }, { "epoch": 11.545595054095827, "grad_norm": 0.8182867765426636, "learning_rate": 2.2626345320154676e-05, "loss": 0.3618, "num_input_tokens_seen": 5027296, "step": 14940 }, { "epoch": 11.549459041731067, "grad_norm": 0.962385356426239, "learning_rate": 2.2609562200424384e-05, "loss": 0.4158, "num_input_tokens_seen": 5028960, "step": 14945 }, { "epoch": 11.553323029366306, "grad_norm": 0.7941834926605225, "learning_rate": 2.2592780167879518e-05, "loss": 0.368, "num_input_tokens_seen": 5030496, "step": 14950 }, { "epoch": 11.557187017001546, "grad_norm": 0.760601282119751, "learning_rate": 2.2575999230152644e-05, "loss": 0.3878, "num_input_tokens_seen": 5032192, "step": 14955 }, { "epoch": 11.561051004636784, "grad_norm": 1.4708118438720703, "learning_rate": 2.255921939487584e-05, "loss": 0.58, "num_input_tokens_seen": 5033888, "step": 14960 }, { "epoch": 11.564914992272024, "grad_norm": 1.4548429250717163, "learning_rate": 2.2542440669680676e-05, "loss": 0.4245, "num_input_tokens_seen": 5035808, "step": 14965 }, { "epoch": 11.568778979907265, "grad_norm": 1.2739027738571167, "learning_rate": 2.2525663062198208e-05, "loss": 0.4894, "num_input_tokens_seen": 5037472, "step": 14970 }, { "epoch": 11.572642967542503, "grad_norm": 0.982111930847168, "learning_rate": 2.2508886580059e-05, "loss": 0.43, "num_input_tokens_seen": 5038976, "step": 14975 }, { "epoch": 11.576506955177743, "grad_norm": 0.9524388313293457, "learning_rate": 2.2492111230893085e-05, "loss": 0.4235, "num_input_tokens_seen": 5040576, "step": 14980 }, { "epoch": 11.580370942812984, "grad_norm": 1.2371597290039062, "learning_rate": 2.247533702233001e-05, "loss": 0.663, "num_input_tokens_seen": 5042304, "step": 14985 }, { "epoch": 11.584234930448222, "grad_norm": 1.2094980478286743, "learning_rate": 2.2458563961998775e-05, "loss": 0.4294, "num_input_tokens_seen": 5044160, "step": 14990 }, { "epoch": 11.588098918083462, "grad_norm": 0.721822202205658, "learning_rate": 2.2441792057527873e-05, "loss": 0.3534, "num_input_tokens_seen": 5045888, "step": 14995 }, { "epoch": 11.591962905718702, "grad_norm": 0.789691686630249, "learning_rate": 2.2425021316545262e-05, "loss": 0.3789, "num_input_tokens_seen": 5047712, "step": 15000 }, { "epoch": 11.59582689335394, "grad_norm": 0.8573911786079407, "learning_rate": 2.2408251746678374e-05, "loss": 0.4025, "num_input_tokens_seen": 5049376, "step": 15005 }, { "epoch": 11.599690880989181, "grad_norm": 0.9765946269035339, "learning_rate": 2.239148335555412e-05, "loss": 0.3573, "num_input_tokens_seen": 5050848, "step": 15010 }, { "epoch": 11.603554868624421, "grad_norm": 0.7311945557594299, "learning_rate": 2.2374716150798856e-05, "loss": 0.5128, "num_input_tokens_seen": 5052352, "step": 15015 }, { "epoch": 11.60741885625966, "grad_norm": 0.9665073156356812, "learning_rate": 2.2357950140038397e-05, "loss": 0.3877, "num_input_tokens_seen": 5054016, "step": 15020 }, { "epoch": 11.6112828438949, "grad_norm": 1.0984325408935547, "learning_rate": 2.2341185330898043e-05, "loss": 0.3858, "num_input_tokens_seen": 5055744, "step": 15025 }, { "epoch": 11.615146831530138, "grad_norm": 1.045177936553955, "learning_rate": 2.232442173100253e-05, "loss": 0.4165, "num_input_tokens_seen": 5057600, "step": 15030 }, { "epoch": 11.619010819165378, "grad_norm": 1.20137619972229, "learning_rate": 2.2307659347976033e-05, "loss": 0.562, "num_input_tokens_seen": 5059328, "step": 15035 }, { "epoch": 11.622874806800619, "grad_norm": 1.0413398742675781, "learning_rate": 2.229089818944219e-05, "loss": 0.4138, "num_input_tokens_seen": 5061056, "step": 15040 }, { "epoch": 11.626738794435857, "grad_norm": 0.6397876739501953, "learning_rate": 2.2274138263024074e-05, "loss": 0.4098, "num_input_tokens_seen": 5062848, "step": 15045 }, { "epoch": 11.630602782071097, "grad_norm": 1.5936278104782104, "learning_rate": 2.2257379576344203e-05, "loss": 0.4814, "num_input_tokens_seen": 5064672, "step": 15050 }, { "epoch": 11.634466769706338, "grad_norm": 1.4757492542266846, "learning_rate": 2.2240622137024522e-05, "loss": 0.4772, "num_input_tokens_seen": 5066368, "step": 15055 }, { "epoch": 11.638330757341576, "grad_norm": 0.6676020622253418, "learning_rate": 2.222386595268641e-05, "loss": 0.4097, "num_input_tokens_seen": 5068096, "step": 15060 }, { "epoch": 11.642194744976816, "grad_norm": 1.8278802633285522, "learning_rate": 2.22071110309507e-05, "loss": 0.3974, "num_input_tokens_seen": 5069728, "step": 15065 }, { "epoch": 11.646058732612056, "grad_norm": 1.0064833164215088, "learning_rate": 2.219035737943762e-05, "loss": 0.5247, "num_input_tokens_seen": 5071680, "step": 15070 }, { "epoch": 11.649922720247295, "grad_norm": 0.9730477929115295, "learning_rate": 2.2173605005766825e-05, "loss": 0.3473, "num_input_tokens_seen": 5073376, "step": 15075 }, { "epoch": 11.653786707882535, "grad_norm": 0.8058966398239136, "learning_rate": 2.21568539175574e-05, "loss": 0.507, "num_input_tokens_seen": 5075136, "step": 15080 }, { "epoch": 11.657650695517773, "grad_norm": 0.653827428817749, "learning_rate": 2.2140104122427848e-05, "loss": 0.4325, "num_input_tokens_seen": 5076768, "step": 15085 }, { "epoch": 11.661514683153014, "grad_norm": 1.1380640268325806, "learning_rate": 2.212335562799606e-05, "loss": 0.4274, "num_input_tokens_seen": 5078400, "step": 15090 }, { "epoch": 11.665378670788254, "grad_norm": 0.6947457790374756, "learning_rate": 2.2106608441879363e-05, "loss": 0.3514, "num_input_tokens_seen": 5080448, "step": 15095 }, { "epoch": 11.669242658423492, "grad_norm": 0.7259266972541809, "learning_rate": 2.2089862571694465e-05, "loss": 0.5203, "num_input_tokens_seen": 5082048, "step": 15100 }, { "epoch": 11.673106646058732, "grad_norm": 0.6728566884994507, "learning_rate": 2.207311802505751e-05, "loss": 0.3775, "num_input_tokens_seen": 5083616, "step": 15105 }, { "epoch": 11.676970633693973, "grad_norm": 0.5872761011123657, "learning_rate": 2.2056374809583998e-05, "loss": 0.3728, "num_input_tokens_seen": 5085344, "step": 15110 }, { "epoch": 11.680834621329211, "grad_norm": 0.7352548837661743, "learning_rate": 2.203963293288886e-05, "loss": 0.563, "num_input_tokens_seen": 5086752, "step": 15115 }, { "epoch": 11.684698608964451, "grad_norm": 1.1528483629226685, "learning_rate": 2.202289240258639e-05, "loss": 0.4028, "num_input_tokens_seen": 5088352, "step": 15120 }, { "epoch": 11.688562596599692, "grad_norm": 1.4783679246902466, "learning_rate": 2.200615322629028e-05, "loss": 0.4537, "num_input_tokens_seen": 5089984, "step": 15125 }, { "epoch": 11.69242658423493, "grad_norm": 0.8753348588943481, "learning_rate": 2.198941541161362e-05, "loss": 0.3954, "num_input_tokens_seen": 5091840, "step": 15130 }, { "epoch": 11.69629057187017, "grad_norm": 0.9608486890792847, "learning_rate": 2.1972678966168857e-05, "loss": 0.3749, "num_input_tokens_seen": 5093440, "step": 15135 }, { "epoch": 11.70015455950541, "grad_norm": 1.3709790706634521, "learning_rate": 2.195594389756782e-05, "loss": 0.5196, "num_input_tokens_seen": 5095136, "step": 15140 }, { "epoch": 11.704018547140649, "grad_norm": 0.8767039775848389, "learning_rate": 2.193921021342173e-05, "loss": 0.4156, "num_input_tokens_seen": 5096896, "step": 15145 }, { "epoch": 11.707882534775889, "grad_norm": 0.5047526955604553, "learning_rate": 2.192247792134118e-05, "loss": 0.3535, "num_input_tokens_seen": 5098880, "step": 15150 }, { "epoch": 11.711746522411127, "grad_norm": 1.3191710710525513, "learning_rate": 2.1905747028936093e-05, "loss": 0.5408, "num_input_tokens_seen": 5100576, "step": 15155 }, { "epoch": 11.715610510046368, "grad_norm": 0.7719799876213074, "learning_rate": 2.188901754381579e-05, "loss": 0.4741, "num_input_tokens_seen": 5102336, "step": 15160 }, { "epoch": 11.719474497681608, "grad_norm": 1.2654422521591187, "learning_rate": 2.187228947358894e-05, "loss": 0.6986, "num_input_tokens_seen": 5104128, "step": 15165 }, { "epoch": 11.723338485316846, "grad_norm": 0.7756655216217041, "learning_rate": 2.185556282586357e-05, "loss": 0.3816, "num_input_tokens_seen": 5105568, "step": 15170 }, { "epoch": 11.727202472952087, "grad_norm": 1.1682251691818237, "learning_rate": 2.183883760824705e-05, "loss": 0.4873, "num_input_tokens_seen": 5107232, "step": 15175 }, { "epoch": 11.731066460587327, "grad_norm": 0.6161404848098755, "learning_rate": 2.1822113828346124e-05, "loss": 0.5737, "num_input_tokens_seen": 5108928, "step": 15180 }, { "epoch": 11.734930448222565, "grad_norm": 1.2346421480178833, "learning_rate": 2.1805391493766854e-05, "loss": 0.4957, "num_input_tokens_seen": 5110656, "step": 15185 }, { "epoch": 11.738794435857805, "grad_norm": 0.9749569892883301, "learning_rate": 2.178867061211467e-05, "loss": 0.4669, "num_input_tokens_seen": 5112416, "step": 15190 }, { "epoch": 11.742658423493046, "grad_norm": 1.1337180137634277, "learning_rate": 2.177195119099432e-05, "loss": 0.5139, "num_input_tokens_seen": 5114272, "step": 15195 }, { "epoch": 11.746522411128284, "grad_norm": 0.7761427760124207, "learning_rate": 2.1755233238009904e-05, "loss": 0.5146, "num_input_tokens_seen": 5116000, "step": 15200 }, { "epoch": 11.750386398763524, "grad_norm": 0.7802093625068665, "learning_rate": 2.1738516760764843e-05, "loss": 0.3178, "num_input_tokens_seen": 5117568, "step": 15205 }, { "epoch": 11.754250386398763, "grad_norm": 1.5770134925842285, "learning_rate": 2.172180176686189e-05, "loss": 0.4644, "num_input_tokens_seen": 5119200, "step": 15210 }, { "epoch": 11.758114374034003, "grad_norm": 1.198026180267334, "learning_rate": 2.1705088263903127e-05, "loss": 0.3782, "num_input_tokens_seen": 5120800, "step": 15215 }, { "epoch": 11.761978361669243, "grad_norm": 0.9554994106292725, "learning_rate": 2.1688376259489958e-05, "loss": 0.667, "num_input_tokens_seen": 5122496, "step": 15220 }, { "epoch": 11.765842349304481, "grad_norm": 1.3860646486282349, "learning_rate": 2.1671665761223087e-05, "loss": 0.4584, "num_input_tokens_seen": 5124128, "step": 15225 }, { "epoch": 11.769706336939722, "grad_norm": 1.5333751440048218, "learning_rate": 2.1654956776702563e-05, "loss": 0.6542, "num_input_tokens_seen": 5125888, "step": 15230 }, { "epoch": 11.773570324574962, "grad_norm": 0.5655025243759155, "learning_rate": 2.1638249313527737e-05, "loss": 0.3748, "num_input_tokens_seen": 5127424, "step": 15235 }, { "epoch": 11.7774343122102, "grad_norm": 1.0238354206085205, "learning_rate": 2.1621543379297258e-05, "loss": 0.5226, "num_input_tokens_seen": 5128896, "step": 15240 }, { "epoch": 11.78129829984544, "grad_norm": 0.8911478519439697, "learning_rate": 2.1604838981609075e-05, "loss": 0.4013, "num_input_tokens_seen": 5130464, "step": 15245 }, { "epoch": 11.78516228748068, "grad_norm": 1.0320912599563599, "learning_rate": 2.158813612806046e-05, "loss": 0.4697, "num_input_tokens_seen": 5132224, "step": 15250 }, { "epoch": 11.78902627511592, "grad_norm": 1.1668727397918701, "learning_rate": 2.1571434826247973e-05, "loss": 0.5717, "num_input_tokens_seen": 5133984, "step": 15255 }, { "epoch": 11.79289026275116, "grad_norm": 1.419548511505127, "learning_rate": 2.1554735083767447e-05, "loss": 0.5523, "num_input_tokens_seen": 5135616, "step": 15260 }, { "epoch": 11.7967542503864, "grad_norm": 1.7926169633865356, "learning_rate": 2.153803690821404e-05, "loss": 0.6238, "num_input_tokens_seen": 5137216, "step": 15265 }, { "epoch": 11.800618238021638, "grad_norm": 1.2311902046203613, "learning_rate": 2.152134030718218e-05, "loss": 0.4067, "num_input_tokens_seen": 5139104, "step": 15270 }, { "epoch": 11.804482225656878, "grad_norm": 0.9631800055503845, "learning_rate": 2.150464528826559e-05, "loss": 0.3792, "num_input_tokens_seen": 5140768, "step": 15275 }, { "epoch": 11.808346213292117, "grad_norm": 0.7888791561126709, "learning_rate": 2.1487951859057248e-05, "loss": 0.4605, "num_input_tokens_seen": 5142592, "step": 15280 }, { "epoch": 11.812210200927357, "grad_norm": 1.1511303186416626, "learning_rate": 2.147126002714944e-05, "loss": 0.404, "num_input_tokens_seen": 5144064, "step": 15285 }, { "epoch": 11.816074188562597, "grad_norm": 0.7554580569267273, "learning_rate": 2.14545698001337e-05, "loss": 0.4238, "num_input_tokens_seen": 5145920, "step": 15290 }, { "epoch": 11.819938176197835, "grad_norm": 0.8886721134185791, "learning_rate": 2.1437881185600845e-05, "loss": 0.4603, "num_input_tokens_seen": 5147616, "step": 15295 }, { "epoch": 11.823802163833076, "grad_norm": 1.1562435626983643, "learning_rate": 2.1421194191140965e-05, "loss": 0.5892, "num_input_tokens_seen": 5149504, "step": 15300 }, { "epoch": 11.827666151468316, "grad_norm": 1.559726357460022, "learning_rate": 2.1404508824343388e-05, "loss": 0.4547, "num_input_tokens_seen": 5150944, "step": 15305 }, { "epoch": 11.831530139103554, "grad_norm": 0.7207401990890503, "learning_rate": 2.1387825092796742e-05, "loss": 0.6812, "num_input_tokens_seen": 5152768, "step": 15310 }, { "epoch": 11.835394126738795, "grad_norm": 1.1154192686080933, "learning_rate": 2.137114300408888e-05, "loss": 0.3949, "num_input_tokens_seen": 5154816, "step": 15315 }, { "epoch": 11.839258114374035, "grad_norm": 2.493115186691284, "learning_rate": 2.13544625658069e-05, "loss": 0.4855, "num_input_tokens_seen": 5156416, "step": 15320 }, { "epoch": 11.843122102009273, "grad_norm": 0.982314944267273, "learning_rate": 2.1337783785537184e-05, "loss": 0.3869, "num_input_tokens_seen": 5158304, "step": 15325 }, { "epoch": 11.846986089644513, "grad_norm": 1.225705862045288, "learning_rate": 2.1321106670865332e-05, "loss": 0.6664, "num_input_tokens_seen": 5159904, "step": 15330 }, { "epoch": 11.850850077279752, "grad_norm": 0.8712247610092163, "learning_rate": 2.1304431229376204e-05, "loss": 0.4073, "num_input_tokens_seen": 5161664, "step": 15335 }, { "epoch": 11.854714064914992, "grad_norm": 0.7898816466331482, "learning_rate": 2.1287757468653882e-05, "loss": 0.3589, "num_input_tokens_seen": 5163456, "step": 15340 }, { "epoch": 11.858578052550232, "grad_norm": 0.8877303600311279, "learning_rate": 2.1271085396281684e-05, "loss": 0.3557, "num_input_tokens_seen": 5165152, "step": 15345 }, { "epoch": 11.86244204018547, "grad_norm": 0.7768025994300842, "learning_rate": 2.1254415019842193e-05, "loss": 0.3795, "num_input_tokens_seen": 5166784, "step": 15350 }, { "epoch": 11.86630602782071, "grad_norm": 0.9558967351913452, "learning_rate": 2.1237746346917174e-05, "loss": 0.4323, "num_input_tokens_seen": 5168352, "step": 15355 }, { "epoch": 11.870170015455951, "grad_norm": 1.1263478994369507, "learning_rate": 2.1221079385087654e-05, "loss": 0.5066, "num_input_tokens_seen": 5170304, "step": 15360 }, { "epoch": 11.87403400309119, "grad_norm": 1.092140555381775, "learning_rate": 2.1204414141933863e-05, "loss": 0.478, "num_input_tokens_seen": 5172128, "step": 15365 }, { "epoch": 11.87789799072643, "grad_norm": 1.0088194608688354, "learning_rate": 2.118775062503524e-05, "loss": 0.4186, "num_input_tokens_seen": 5173888, "step": 15370 }, { "epoch": 11.88176197836167, "grad_norm": 1.2054332494735718, "learning_rate": 2.1171088841970477e-05, "loss": 0.393, "num_input_tokens_seen": 5175616, "step": 15375 }, { "epoch": 11.885625965996908, "grad_norm": 0.9718557000160217, "learning_rate": 2.115442880031743e-05, "loss": 0.3828, "num_input_tokens_seen": 5177120, "step": 15380 }, { "epoch": 11.889489953632149, "grad_norm": 0.9792500734329224, "learning_rate": 2.1137770507653192e-05, "loss": 0.3769, "num_input_tokens_seen": 5178976, "step": 15385 }, { "epoch": 11.893353941267389, "grad_norm": 1.3485289812088013, "learning_rate": 2.1121113971554057e-05, "loss": 0.4, "num_input_tokens_seen": 5180672, "step": 15390 }, { "epoch": 11.897217928902627, "grad_norm": 0.9868807792663574, "learning_rate": 2.1104459199595527e-05, "loss": 0.5022, "num_input_tokens_seen": 5182240, "step": 15395 }, { "epoch": 11.901081916537867, "grad_norm": 0.88093101978302, "learning_rate": 2.1087806199352282e-05, "loss": 0.4705, "num_input_tokens_seen": 5183872, "step": 15400 }, { "epoch": 11.904945904173106, "grad_norm": 0.7881690859794617, "learning_rate": 2.10711549783982e-05, "loss": 0.3559, "num_input_tokens_seen": 5185664, "step": 15405 }, { "epoch": 11.908809891808346, "grad_norm": 1.0551879405975342, "learning_rate": 2.1054505544306367e-05, "loss": 0.3918, "num_input_tokens_seen": 5187168, "step": 15410 }, { "epoch": 11.912673879443586, "grad_norm": 0.7983222603797913, "learning_rate": 2.1037857904649043e-05, "loss": 0.4005, "num_input_tokens_seen": 5188832, "step": 15415 }, { "epoch": 11.916537867078825, "grad_norm": 1.167359471321106, "learning_rate": 2.1021212066997664e-05, "loss": 0.4326, "num_input_tokens_seen": 5190400, "step": 15420 }, { "epoch": 11.920401854714065, "grad_norm": 1.0054153203964233, "learning_rate": 2.1004568038922863e-05, "loss": 0.6065, "num_input_tokens_seen": 5192288, "step": 15425 }, { "epoch": 11.924265842349305, "grad_norm": 0.7413691282272339, "learning_rate": 2.0987925827994457e-05, "loss": 0.4377, "num_input_tokens_seen": 5194144, "step": 15430 }, { "epoch": 11.928129829984544, "grad_norm": 0.7110912799835205, "learning_rate": 2.0971285441781407e-05, "loss": 0.3506, "num_input_tokens_seen": 5195616, "step": 15435 }, { "epoch": 11.931993817619784, "grad_norm": 0.867365837097168, "learning_rate": 2.0954646887851865e-05, "loss": 0.405, "num_input_tokens_seen": 5197216, "step": 15440 }, { "epoch": 11.935857805255024, "grad_norm": 1.049264669418335, "learning_rate": 2.093801017377315e-05, "loss": 0.4581, "num_input_tokens_seen": 5198880, "step": 15445 }, { "epoch": 11.939721792890262, "grad_norm": 0.8913770914077759, "learning_rate": 2.092137530711173e-05, "loss": 0.4889, "num_input_tokens_seen": 5200416, "step": 15450 }, { "epoch": 11.943585780525503, "grad_norm": 0.9814773797988892, "learning_rate": 2.0904742295433245e-05, "loss": 0.527, "num_input_tokens_seen": 5202112, "step": 15455 }, { "epoch": 11.947449768160741, "grad_norm": 0.9017194509506226, "learning_rate": 2.0888111146302493e-05, "loss": 0.4459, "num_input_tokens_seen": 5204064, "step": 15460 }, { "epoch": 11.951313755795981, "grad_norm": 1.2948224544525146, "learning_rate": 2.0871481867283404e-05, "loss": 0.3724, "num_input_tokens_seen": 5205504, "step": 15465 }, { "epoch": 11.955177743431221, "grad_norm": 1.8532836437225342, "learning_rate": 2.08548544659391e-05, "loss": 0.4284, "num_input_tokens_seen": 5207456, "step": 15470 }, { "epoch": 11.95904173106646, "grad_norm": 1.4074289798736572, "learning_rate": 2.0838228949831803e-05, "loss": 0.6368, "num_input_tokens_seen": 5209312, "step": 15475 }, { "epoch": 11.9629057187017, "grad_norm": 1.2498472929000854, "learning_rate": 2.0821605326522908e-05, "loss": 0.4477, "num_input_tokens_seen": 5211136, "step": 15480 }, { "epoch": 11.96676970633694, "grad_norm": 0.745514988899231, "learning_rate": 2.0804983603572935e-05, "loss": 0.4308, "num_input_tokens_seen": 5212992, "step": 15485 }, { "epoch": 11.970633693972179, "grad_norm": 2.0529611110687256, "learning_rate": 2.078836378854154e-05, "loss": 0.5074, "num_input_tokens_seen": 5214560, "step": 15490 }, { "epoch": 11.974497681607419, "grad_norm": 0.9707750082015991, "learning_rate": 2.0771745888987515e-05, "loss": 0.3941, "num_input_tokens_seen": 5216416, "step": 15495 }, { "epoch": 11.978361669242659, "grad_norm": 0.8408107757568359, "learning_rate": 2.0755129912468787e-05, "loss": 0.4009, "num_input_tokens_seen": 5218304, "step": 15500 }, { "epoch": 11.982225656877898, "grad_norm": 0.7503194808959961, "learning_rate": 2.0738515866542385e-05, "loss": 0.4848, "num_input_tokens_seen": 5219936, "step": 15505 }, { "epoch": 11.986089644513138, "grad_norm": 1.2081913948059082, "learning_rate": 2.072190375876449e-05, "loss": 0.605, "num_input_tokens_seen": 5221472, "step": 15510 }, { "epoch": 11.989953632148378, "grad_norm": 0.914701521396637, "learning_rate": 2.0705293596690395e-05, "loss": 0.504, "num_input_tokens_seen": 5222976, "step": 15515 }, { "epoch": 11.993817619783616, "grad_norm": 1.6826276779174805, "learning_rate": 2.068868538787449e-05, "loss": 0.4751, "num_input_tokens_seen": 5224608, "step": 15520 }, { "epoch": 11.997681607418857, "grad_norm": 0.8794524073600769, "learning_rate": 2.0672079139870287e-05, "loss": 0.5518, "num_input_tokens_seen": 5226176, "step": 15525 }, { "epoch": 12.0, "eval_loss": 0.46775001287460327, "eval_runtime": 6.3651, "eval_samples_per_second": 90.336, "eval_steps_per_second": 22.623, "num_input_tokens_seen": 5227040, "step": 15528 }, { "epoch": 12.001545595054095, "grad_norm": 0.8731666803359985, "learning_rate": 2.0655474860230413e-05, "loss": 0.4017, "num_input_tokens_seen": 5227744, "step": 15530 }, { "epoch": 12.005409582689335, "grad_norm": 1.1558313369750977, "learning_rate": 2.0638872556506592e-05, "loss": 0.3825, "num_input_tokens_seen": 5229440, "step": 15535 }, { "epoch": 12.009273570324575, "grad_norm": 1.2481865882873535, "learning_rate": 2.0622272236249646e-05, "loss": 0.4248, "num_input_tokens_seen": 5231072, "step": 15540 }, { "epoch": 12.013137557959814, "grad_norm": 1.063848614692688, "learning_rate": 2.0605673907009495e-05, "loss": 0.6733, "num_input_tokens_seen": 5232832, "step": 15545 }, { "epoch": 12.017001545595054, "grad_norm": 0.9588082432746887, "learning_rate": 2.058907757633518e-05, "loss": 0.5897, "num_input_tokens_seen": 5234752, "step": 15550 }, { "epoch": 12.020865533230294, "grad_norm": 1.033333659172058, "learning_rate": 2.05724832517748e-05, "loss": 0.4069, "num_input_tokens_seen": 5236128, "step": 15555 }, { "epoch": 12.024729520865533, "grad_norm": 1.056540608406067, "learning_rate": 2.0555890940875548e-05, "loss": 0.3524, "num_input_tokens_seen": 5237824, "step": 15560 }, { "epoch": 12.028593508500773, "grad_norm": 0.5547590255737305, "learning_rate": 2.0539300651183715e-05, "loss": 0.3401, "num_input_tokens_seen": 5239616, "step": 15565 }, { "epoch": 12.032457496136013, "grad_norm": 2.000777006149292, "learning_rate": 2.0522712390244662e-05, "loss": 0.395, "num_input_tokens_seen": 5241152, "step": 15570 }, { "epoch": 12.036321483771252, "grad_norm": 1.6023553609848022, "learning_rate": 2.0506126165602816e-05, "loss": 0.4006, "num_input_tokens_seen": 5242560, "step": 15575 }, { "epoch": 12.040185471406492, "grad_norm": 1.114161729812622, "learning_rate": 2.0489541984801717e-05, "loss": 0.3628, "num_input_tokens_seen": 5244096, "step": 15580 }, { "epoch": 12.04404945904173, "grad_norm": 0.6636527180671692, "learning_rate": 2.0472959855383916e-05, "loss": 0.4169, "num_input_tokens_seen": 5245504, "step": 15585 }, { "epoch": 12.04791344667697, "grad_norm": 1.0058808326721191, "learning_rate": 2.04563797848911e-05, "loss": 0.4409, "num_input_tokens_seen": 5247424, "step": 15590 }, { "epoch": 12.05177743431221, "grad_norm": 1.1294001340866089, "learning_rate": 2.0439801780863963e-05, "loss": 0.3649, "num_input_tokens_seen": 5249184, "step": 15595 }, { "epoch": 12.055641421947449, "grad_norm": 1.8210854530334473, "learning_rate": 2.042322585084229e-05, "loss": 0.4866, "num_input_tokens_seen": 5251040, "step": 15600 }, { "epoch": 12.05950540958269, "grad_norm": 0.7962170243263245, "learning_rate": 2.040665200236491e-05, "loss": 0.3824, "num_input_tokens_seen": 5252864, "step": 15605 }, { "epoch": 12.06336939721793, "grad_norm": 0.8415445685386658, "learning_rate": 2.0390080242969702e-05, "loss": 0.4466, "num_input_tokens_seen": 5254656, "step": 15610 }, { "epoch": 12.067233384853168, "grad_norm": 0.6798498630523682, "learning_rate": 2.037351058019361e-05, "loss": 0.5187, "num_input_tokens_seen": 5256288, "step": 15615 }, { "epoch": 12.071097372488408, "grad_norm": 0.8493430614471436, "learning_rate": 2.0356943021572617e-05, "loss": 0.4506, "num_input_tokens_seen": 5258144, "step": 15620 }, { "epoch": 12.074961360123648, "grad_norm": 1.353190541267395, "learning_rate": 2.0340377574641734e-05, "loss": 0.4116, "num_input_tokens_seen": 5259680, "step": 15625 }, { "epoch": 12.078825347758887, "grad_norm": 1.4273030757904053, "learning_rate": 2.0323814246935036e-05, "loss": 0.644, "num_input_tokens_seen": 5261568, "step": 15630 }, { "epoch": 12.082689335394127, "grad_norm": 1.1584272384643555, "learning_rate": 2.030725304598563e-05, "loss": 0.3958, "num_input_tokens_seen": 5263040, "step": 15635 }, { "epoch": 12.086553323029367, "grad_norm": 1.3747555017471313, "learning_rate": 2.0290693979325646e-05, "loss": 0.4411, "num_input_tokens_seen": 5264896, "step": 15640 }, { "epoch": 12.090417310664606, "grad_norm": 0.7602180242538452, "learning_rate": 2.0274137054486232e-05, "loss": 0.3504, "num_input_tokens_seen": 5266592, "step": 15645 }, { "epoch": 12.094281298299846, "grad_norm": 0.7697697281837463, "learning_rate": 2.02575822789976e-05, "loss": 0.4803, "num_input_tokens_seen": 5268160, "step": 15650 }, { "epoch": 12.098145285935084, "grad_norm": 0.6462953686714172, "learning_rate": 2.0241029660388943e-05, "loss": 0.5298, "num_input_tokens_seen": 5269792, "step": 15655 }, { "epoch": 12.102009273570324, "grad_norm": 0.9823091626167297, "learning_rate": 2.0224479206188496e-05, "loss": 0.3638, "num_input_tokens_seen": 5271488, "step": 15660 }, { "epoch": 12.105873261205565, "grad_norm": 1.3124139308929443, "learning_rate": 2.0207930923923497e-05, "loss": 0.355, "num_input_tokens_seen": 5272768, "step": 15665 }, { "epoch": 12.109737248840803, "grad_norm": 0.5912295579910278, "learning_rate": 2.0191384821120225e-05, "loss": 0.6263, "num_input_tokens_seen": 5274496, "step": 15670 }, { "epoch": 12.113601236476043, "grad_norm": 1.4227800369262695, "learning_rate": 2.0174840905303933e-05, "loss": 0.4227, "num_input_tokens_seen": 5276032, "step": 15675 }, { "epoch": 12.117465224111283, "grad_norm": 0.7823850512504578, "learning_rate": 2.0158299183998887e-05, "loss": 0.3602, "num_input_tokens_seen": 5277888, "step": 15680 }, { "epoch": 12.121329211746522, "grad_norm": 0.8688614368438721, "learning_rate": 2.0141759664728376e-05, "loss": 0.3874, "num_input_tokens_seen": 5279584, "step": 15685 }, { "epoch": 12.125193199381762, "grad_norm": 1.34271240234375, "learning_rate": 2.012522235501466e-05, "loss": 0.4518, "num_input_tokens_seen": 5281248, "step": 15690 }, { "epoch": 12.129057187017002, "grad_norm": 0.7379155158996582, "learning_rate": 2.010868726237901e-05, "loss": 0.3955, "num_input_tokens_seen": 5283008, "step": 15695 }, { "epoch": 12.13292117465224, "grad_norm": 1.1065728664398193, "learning_rate": 2.009215439434169e-05, "loss": 0.5007, "num_input_tokens_seen": 5284800, "step": 15700 }, { "epoch": 12.136785162287481, "grad_norm": 1.0089627504348755, "learning_rate": 2.007562375842193e-05, "loss": 0.3808, "num_input_tokens_seen": 5286656, "step": 15705 }, { "epoch": 12.14064914992272, "grad_norm": 0.6955393552780151, "learning_rate": 2.005909536213799e-05, "loss": 0.4551, "num_input_tokens_seen": 5288288, "step": 15710 }, { "epoch": 12.14451313755796, "grad_norm": 0.8325414657592773, "learning_rate": 2.0042569213007064e-05, "loss": 0.5625, "num_input_tokens_seen": 5290176, "step": 15715 }, { "epoch": 12.1483771251932, "grad_norm": 1.7040212154388428, "learning_rate": 2.002604531854535e-05, "loss": 0.4206, "num_input_tokens_seen": 5291872, "step": 15720 }, { "epoch": 12.152241112828438, "grad_norm": 0.7357850074768066, "learning_rate": 2.000952368626802e-05, "loss": 0.4341, "num_input_tokens_seen": 5293504, "step": 15725 }, { "epoch": 12.156105100463678, "grad_norm": 1.0791418552398682, "learning_rate": 1.9993004323689193e-05, "loss": 0.4118, "num_input_tokens_seen": 5295136, "step": 15730 }, { "epoch": 12.159969088098919, "grad_norm": 0.9893455505371094, "learning_rate": 1.997648723832199e-05, "loss": 0.3673, "num_input_tokens_seen": 5296672, "step": 15735 }, { "epoch": 12.163833075734157, "grad_norm": 1.1496264934539795, "learning_rate": 1.995997243767848e-05, "loss": 0.3581, "num_input_tokens_seen": 5298496, "step": 15740 }, { "epoch": 12.167697063369397, "grad_norm": 0.6440170407295227, "learning_rate": 1.994345992926968e-05, "loss": 0.3733, "num_input_tokens_seen": 5299968, "step": 15745 }, { "epoch": 12.171561051004637, "grad_norm": 0.883155107498169, "learning_rate": 1.9926949720605587e-05, "loss": 0.5293, "num_input_tokens_seen": 5301440, "step": 15750 }, { "epoch": 12.175425038639876, "grad_norm": 1.0237345695495605, "learning_rate": 1.9910441819195146e-05, "loss": 0.3716, "num_input_tokens_seen": 5302976, "step": 15755 }, { "epoch": 12.179289026275116, "grad_norm": 0.8436754941940308, "learning_rate": 1.989393623254625e-05, "loss": 0.471, "num_input_tokens_seen": 5304640, "step": 15760 }, { "epoch": 12.183153013910356, "grad_norm": 0.5622313022613525, "learning_rate": 1.9877432968165728e-05, "loss": 0.443, "num_input_tokens_seen": 5306560, "step": 15765 }, { "epoch": 12.187017001545595, "grad_norm": 1.090035319328308, "learning_rate": 1.9860932033559377e-05, "loss": 0.4047, "num_input_tokens_seen": 5308064, "step": 15770 }, { "epoch": 12.190880989180835, "grad_norm": 1.050630807876587, "learning_rate": 1.984443343623191e-05, "loss": 0.36, "num_input_tokens_seen": 5309792, "step": 15775 }, { "epoch": 12.194744976816073, "grad_norm": 0.6276386380195618, "learning_rate": 1.982793718368699e-05, "loss": 0.639, "num_input_tokens_seen": 5311680, "step": 15780 }, { "epoch": 12.198608964451314, "grad_norm": 1.0605409145355225, "learning_rate": 1.9811443283427205e-05, "loss": 0.3837, "num_input_tokens_seen": 5313312, "step": 15785 }, { "epoch": 12.202472952086554, "grad_norm": 1.1285394430160522, "learning_rate": 1.9794951742954098e-05, "loss": 0.4665, "num_input_tokens_seen": 5314912, "step": 15790 }, { "epoch": 12.206336939721792, "grad_norm": 1.128738284111023, "learning_rate": 1.9778462569768113e-05, "loss": 0.4759, "num_input_tokens_seen": 5316448, "step": 15795 }, { "epoch": 12.210200927357032, "grad_norm": 0.8472760319709778, "learning_rate": 1.9761975771368615e-05, "loss": 0.3522, "num_input_tokens_seen": 5317888, "step": 15800 }, { "epoch": 12.214064914992273, "grad_norm": 0.9623802304267883, "learning_rate": 1.974549135525391e-05, "loss": 0.5611, "num_input_tokens_seen": 5319424, "step": 15805 }, { "epoch": 12.217928902627511, "grad_norm": 0.48630261421203613, "learning_rate": 1.9729009328921205e-05, "loss": 0.4362, "num_input_tokens_seen": 5321120, "step": 15810 }, { "epoch": 12.221792890262751, "grad_norm": 0.7767500281333923, "learning_rate": 1.971252969986662e-05, "loss": 0.4508, "num_input_tokens_seen": 5322912, "step": 15815 }, { "epoch": 12.225656877897991, "grad_norm": 1.2043731212615967, "learning_rate": 1.9696052475585196e-05, "loss": 0.4027, "num_input_tokens_seen": 5324640, "step": 15820 }, { "epoch": 12.22952086553323, "grad_norm": 1.6272464990615845, "learning_rate": 1.9679577663570863e-05, "loss": 0.591, "num_input_tokens_seen": 5326304, "step": 15825 }, { "epoch": 12.23338485316847, "grad_norm": 1.1179394721984863, "learning_rate": 1.966310527131648e-05, "loss": 0.5956, "num_input_tokens_seen": 5328000, "step": 15830 }, { "epoch": 12.237248840803709, "grad_norm": 1.3357877731323242, "learning_rate": 1.9646635306313777e-05, "loss": 0.4224, "num_input_tokens_seen": 5329664, "step": 15835 }, { "epoch": 12.241112828438949, "grad_norm": 0.8967232704162598, "learning_rate": 1.96301677760534e-05, "loss": 0.3612, "num_input_tokens_seen": 5331200, "step": 15840 }, { "epoch": 12.244976816074189, "grad_norm": 0.9707668423652649, "learning_rate": 1.9613702688024877e-05, "loss": 0.6106, "num_input_tokens_seen": 5332960, "step": 15845 }, { "epoch": 12.248840803709427, "grad_norm": 1.2498579025268555, "learning_rate": 1.9597240049716625e-05, "loss": 0.5379, "num_input_tokens_seen": 5334784, "step": 15850 }, { "epoch": 12.252704791344668, "grad_norm": 1.0886653661727905, "learning_rate": 1.958077986861596e-05, "loss": 0.359, "num_input_tokens_seen": 5336448, "step": 15855 }, { "epoch": 12.256568778979908, "grad_norm": 1.065354347229004, "learning_rate": 1.9564322152209065e-05, "loss": 0.4258, "num_input_tokens_seen": 5337984, "step": 15860 }, { "epoch": 12.260432766615146, "grad_norm": 0.9959069490432739, "learning_rate": 1.9547866907980993e-05, "loss": 0.5403, "num_input_tokens_seen": 5339616, "step": 15865 }, { "epoch": 12.264296754250386, "grad_norm": 0.9873225688934326, "learning_rate": 1.9531414143415715e-05, "loss": 0.3156, "num_input_tokens_seen": 5341024, "step": 15870 }, { "epoch": 12.268160741885627, "grad_norm": 1.7160000801086426, "learning_rate": 1.9514963865996034e-05, "loss": 0.7038, "num_input_tokens_seen": 5342656, "step": 15875 }, { "epoch": 12.272024729520865, "grad_norm": 0.6056385636329651, "learning_rate": 1.949851608320364e-05, "loss": 0.3535, "num_input_tokens_seen": 5344480, "step": 15880 }, { "epoch": 12.275888717156105, "grad_norm": 0.9760804772377014, "learning_rate": 1.948207080251907e-05, "loss": 0.3632, "num_input_tokens_seen": 5346080, "step": 15885 }, { "epoch": 12.279752704791346, "grad_norm": 1.683011770248413, "learning_rate": 1.946562803142175e-05, "loss": 0.4455, "num_input_tokens_seen": 5347616, "step": 15890 }, { "epoch": 12.283616692426584, "grad_norm": 0.754531741142273, "learning_rate": 1.944918777738995e-05, "loss": 0.4973, "num_input_tokens_seen": 5349088, "step": 15895 }, { "epoch": 12.287480680061824, "grad_norm": 0.6606409549713135, "learning_rate": 1.943275004790078e-05, "loss": 0.5847, "num_input_tokens_seen": 5350976, "step": 15900 }, { "epoch": 12.291344667697063, "grad_norm": 0.8538486957550049, "learning_rate": 1.9416314850430224e-05, "loss": 0.5909, "num_input_tokens_seen": 5353088, "step": 15905 }, { "epoch": 12.295208655332303, "grad_norm": 1.6944448947906494, "learning_rate": 1.9399882192453127e-05, "loss": 0.7966, "num_input_tokens_seen": 5354752, "step": 15910 }, { "epoch": 12.299072642967543, "grad_norm": 0.848943293094635, "learning_rate": 1.938345208144315e-05, "loss": 0.4105, "num_input_tokens_seen": 5356416, "step": 15915 }, { "epoch": 12.302936630602781, "grad_norm": 0.8264021277427673, "learning_rate": 1.936702452487279e-05, "loss": 0.3895, "num_input_tokens_seen": 5357952, "step": 15920 }, { "epoch": 12.306800618238022, "grad_norm": 0.744295060634613, "learning_rate": 1.935059953021342e-05, "loss": 0.4414, "num_input_tokens_seen": 5359712, "step": 15925 }, { "epoch": 12.310664605873262, "grad_norm": 1.0191706418991089, "learning_rate": 1.9334177104935218e-05, "loss": 0.4166, "num_input_tokens_seen": 5361792, "step": 15930 }, { "epoch": 12.3145285935085, "grad_norm": 1.1299477815628052, "learning_rate": 1.931775725650719e-05, "loss": 0.4069, "num_input_tokens_seen": 5363328, "step": 15935 }, { "epoch": 12.31839258114374, "grad_norm": 1.059721827507019, "learning_rate": 1.93013399923972e-05, "loss": 0.4879, "num_input_tokens_seen": 5365376, "step": 15940 }, { "epoch": 12.32225656877898, "grad_norm": 1.0054349899291992, "learning_rate": 1.9284925320071898e-05, "loss": 0.4595, "num_input_tokens_seen": 5366976, "step": 15945 }, { "epoch": 12.326120556414219, "grad_norm": 1.014601469039917, "learning_rate": 1.92685132469968e-05, "loss": 0.3993, "num_input_tokens_seen": 5368832, "step": 15950 }, { "epoch": 12.32998454404946, "grad_norm": 0.9155606627464294, "learning_rate": 1.9252103780636192e-05, "loss": 0.607, "num_input_tokens_seen": 5370624, "step": 15955 }, { "epoch": 12.333848531684698, "grad_norm": 0.9594824314117432, "learning_rate": 1.9235696928453212e-05, "loss": 0.6058, "num_input_tokens_seen": 5372288, "step": 15960 }, { "epoch": 12.337712519319938, "grad_norm": 1.2907263040542603, "learning_rate": 1.9219292697909794e-05, "loss": 0.4241, "num_input_tokens_seen": 5374080, "step": 15965 }, { "epoch": 12.341576506955178, "grad_norm": 0.6449705362319946, "learning_rate": 1.920289109646667e-05, "loss": 0.504, "num_input_tokens_seen": 5375968, "step": 15970 }, { "epoch": 12.345440494590417, "grad_norm": 1.033062219619751, "learning_rate": 1.9186492131583395e-05, "loss": 0.4056, "num_input_tokens_seen": 5377472, "step": 15975 }, { "epoch": 12.349304482225657, "grad_norm": 1.3175979852676392, "learning_rate": 1.9170095810718318e-05, "loss": 0.4485, "num_input_tokens_seen": 5379200, "step": 15980 }, { "epoch": 12.353168469860897, "grad_norm": 0.7826077938079834, "learning_rate": 1.9153702141328567e-05, "loss": 0.5768, "num_input_tokens_seen": 5381056, "step": 15985 }, { "epoch": 12.357032457496135, "grad_norm": 1.546846628189087, "learning_rate": 1.9137311130870104e-05, "loss": 0.7275, "num_input_tokens_seen": 5382624, "step": 15990 }, { "epoch": 12.360896445131376, "grad_norm": 0.6696969866752625, "learning_rate": 1.9120922786797648e-05, "loss": 0.5485, "num_input_tokens_seen": 5384320, "step": 15995 }, { "epoch": 12.364760432766616, "grad_norm": 1.46556556224823, "learning_rate": 1.910453711656472e-05, "loss": 0.5636, "num_input_tokens_seen": 5386176, "step": 16000 }, { "epoch": 12.368624420401854, "grad_norm": 1.0433255434036255, "learning_rate": 1.9088154127623615e-05, "loss": 0.4937, "num_input_tokens_seen": 5387872, "step": 16005 }, { "epoch": 12.372488408037094, "grad_norm": 0.6778165102005005, "learning_rate": 1.907177382742542e-05, "loss": 0.3944, "num_input_tokens_seen": 5389632, "step": 16010 }, { "epoch": 12.376352395672335, "grad_norm": 0.7460423111915588, "learning_rate": 1.905539622341999e-05, "loss": 0.401, "num_input_tokens_seen": 5391168, "step": 16015 }, { "epoch": 12.380216383307573, "grad_norm": 1.2309132814407349, "learning_rate": 1.9039021323055956e-05, "loss": 0.447, "num_input_tokens_seen": 5393024, "step": 16020 }, { "epoch": 12.384080370942813, "grad_norm": 0.7239397764205933, "learning_rate": 1.902264913378072e-05, "loss": 0.3777, "num_input_tokens_seen": 5394784, "step": 16025 }, { "epoch": 12.387944358578052, "grad_norm": 2.171853542327881, "learning_rate": 1.9006279663040458e-05, "loss": 0.5038, "num_input_tokens_seen": 5396928, "step": 16030 }, { "epoch": 12.391808346213292, "grad_norm": 0.8980265855789185, "learning_rate": 1.8989912918280102e-05, "loss": 0.4079, "num_input_tokens_seen": 5398464, "step": 16035 }, { "epoch": 12.395672333848532, "grad_norm": 1.7087563276290894, "learning_rate": 1.897354890694335e-05, "loss": 0.3266, "num_input_tokens_seen": 5400224, "step": 16040 }, { "epoch": 12.39953632148377, "grad_norm": 0.6426692008972168, "learning_rate": 1.8957187636472635e-05, "loss": 0.3623, "num_input_tokens_seen": 5401952, "step": 16045 }, { "epoch": 12.40340030911901, "grad_norm": 1.2569254636764526, "learning_rate": 1.894082911430918e-05, "loss": 0.4089, "num_input_tokens_seen": 5403744, "step": 16050 }, { "epoch": 12.407264296754251, "grad_norm": 1.434838056564331, "learning_rate": 1.8924473347892922e-05, "loss": 0.4447, "num_input_tokens_seen": 5405600, "step": 16055 }, { "epoch": 12.41112828438949, "grad_norm": 0.7474325895309448, "learning_rate": 1.890812034466258e-05, "loss": 0.5178, "num_input_tokens_seen": 5407072, "step": 16060 }, { "epoch": 12.41499227202473, "grad_norm": 1.2940410375595093, "learning_rate": 1.8891770112055576e-05, "loss": 0.5597, "num_input_tokens_seen": 5408768, "step": 16065 }, { "epoch": 12.41885625965997, "grad_norm": 0.7478259801864624, "learning_rate": 1.8875422657508115e-05, "loss": 0.4265, "num_input_tokens_seen": 5410336, "step": 16070 }, { "epoch": 12.422720247295208, "grad_norm": 0.882434070110321, "learning_rate": 1.885907798845511e-05, "loss": 0.4435, "num_input_tokens_seen": 5412160, "step": 16075 }, { "epoch": 12.426584234930449, "grad_norm": 0.8442777991294861, "learning_rate": 1.8842736112330206e-05, "loss": 0.5012, "num_input_tokens_seen": 5413696, "step": 16080 }, { "epoch": 12.430448222565687, "grad_norm": 0.9342008233070374, "learning_rate": 1.8826397036565797e-05, "loss": 0.5253, "num_input_tokens_seen": 5415328, "step": 16085 }, { "epoch": 12.434312210200927, "grad_norm": 1.4410828351974487, "learning_rate": 1.8810060768592992e-05, "loss": 0.4594, "num_input_tokens_seen": 5416704, "step": 16090 }, { "epoch": 12.438176197836167, "grad_norm": 1.1614561080932617, "learning_rate": 1.8793727315841608e-05, "loss": 0.5125, "num_input_tokens_seen": 5418272, "step": 16095 }, { "epoch": 12.442040185471406, "grad_norm": 1.0579508543014526, "learning_rate": 1.877739668574022e-05, "loss": 0.4492, "num_input_tokens_seen": 5419936, "step": 16100 }, { "epoch": 12.445904173106646, "grad_norm": 0.6575266718864441, "learning_rate": 1.876106888571607e-05, "loss": 0.4327, "num_input_tokens_seen": 5421696, "step": 16105 }, { "epoch": 12.449768160741886, "grad_norm": 1.0587490797042847, "learning_rate": 1.8744743923195166e-05, "loss": 0.5038, "num_input_tokens_seen": 5423488, "step": 16110 }, { "epoch": 12.453632148377125, "grad_norm": 0.7954670190811157, "learning_rate": 1.872842180560218e-05, "loss": 0.559, "num_input_tokens_seen": 5425088, "step": 16115 }, { "epoch": 12.457496136012365, "grad_norm": 1.3876866102218628, "learning_rate": 1.8712102540360527e-05, "loss": 0.3655, "num_input_tokens_seen": 5426752, "step": 16120 }, { "epoch": 12.461360123647605, "grad_norm": 0.8256262540817261, "learning_rate": 1.869578613489229e-05, "loss": 0.465, "num_input_tokens_seen": 5428384, "step": 16125 }, { "epoch": 12.465224111282843, "grad_norm": 0.8115943670272827, "learning_rate": 1.8679472596618268e-05, "loss": 0.7049, "num_input_tokens_seen": 5429952, "step": 16130 }, { "epoch": 12.469088098918084, "grad_norm": 0.7475764751434326, "learning_rate": 1.8663161932957966e-05, "loss": 0.3156, "num_input_tokens_seen": 5431776, "step": 16135 }, { "epoch": 12.472952086553324, "grad_norm": 0.8259214162826538, "learning_rate": 1.8646854151329575e-05, "loss": 0.444, "num_input_tokens_seen": 5433664, "step": 16140 }, { "epoch": 12.476816074188562, "grad_norm": 1.4651824235916138, "learning_rate": 1.863054925914995e-05, "loss": 0.6416, "num_input_tokens_seen": 5435296, "step": 16145 }, { "epoch": 12.480680061823803, "grad_norm": 1.0618172883987427, "learning_rate": 1.861424726383466e-05, "loss": 0.4534, "num_input_tokens_seen": 5436864, "step": 16150 }, { "epoch": 12.484544049459041, "grad_norm": 0.9075713753700256, "learning_rate": 1.8597948172797975e-05, "loss": 0.4314, "num_input_tokens_seen": 5438336, "step": 16155 }, { "epoch": 12.488408037094281, "grad_norm": 0.6276659369468689, "learning_rate": 1.85816519934528e-05, "loss": 0.5476, "num_input_tokens_seen": 5440064, "step": 16160 }, { "epoch": 12.492272024729521, "grad_norm": 1.0740056037902832, "learning_rate": 1.8565358733210725e-05, "loss": 0.3687, "num_input_tokens_seen": 5441568, "step": 16165 }, { "epoch": 12.49613601236476, "grad_norm": 0.963576078414917, "learning_rate": 1.8549068399482043e-05, "loss": 0.4452, "num_input_tokens_seen": 5443296, "step": 16170 }, { "epoch": 12.5, "grad_norm": 1.521815538406372, "learning_rate": 1.8532780999675686e-05, "loss": 0.4643, "num_input_tokens_seen": 5444768, "step": 16175 }, { "epoch": 12.50386398763524, "grad_norm": 0.8995427489280701, "learning_rate": 1.8516496541199257e-05, "loss": 0.4269, "num_input_tokens_seen": 5446272, "step": 16180 }, { "epoch": 12.507727975270479, "grad_norm": 1.2289974689483643, "learning_rate": 1.8500215031459035e-05, "loss": 0.4475, "num_input_tokens_seen": 5448032, "step": 16185 }, { "epoch": 12.511591962905719, "grad_norm": 1.4149394035339355, "learning_rate": 1.8483936477859932e-05, "loss": 0.4681, "num_input_tokens_seen": 5449760, "step": 16190 }, { "epoch": 12.515455950540959, "grad_norm": 0.8052827715873718, "learning_rate": 1.846766088780555e-05, "loss": 0.409, "num_input_tokens_seen": 5451552, "step": 16195 }, { "epoch": 12.519319938176197, "grad_norm": 0.7462022304534912, "learning_rate": 1.845138826869811e-05, "loss": 0.3849, "num_input_tokens_seen": 5453664, "step": 16200 }, { "epoch": 12.523183925811438, "grad_norm": 0.8091526627540588, "learning_rate": 1.8435118627938512e-05, "loss": 0.4343, "num_input_tokens_seen": 5455552, "step": 16205 }, { "epoch": 12.527047913446676, "grad_norm": 1.1019734144210815, "learning_rate": 1.8418851972926275e-05, "loss": 0.6711, "num_input_tokens_seen": 5457312, "step": 16210 }, { "epoch": 12.530911901081916, "grad_norm": 0.809403657913208, "learning_rate": 1.840258831105957e-05, "loss": 0.4041, "num_input_tokens_seen": 5459232, "step": 16215 }, { "epoch": 12.534775888717157, "grad_norm": 0.8267434239387512, "learning_rate": 1.8386327649735217e-05, "loss": 0.4447, "num_input_tokens_seen": 5460832, "step": 16220 }, { "epoch": 12.538639876352395, "grad_norm": 1.6880290508270264, "learning_rate": 1.8370069996348658e-05, "loss": 0.5255, "num_input_tokens_seen": 5462432, "step": 16225 }, { "epoch": 12.542503863987635, "grad_norm": 1.8351819515228271, "learning_rate": 1.835381535829396e-05, "loss": 0.6692, "num_input_tokens_seen": 5464480, "step": 16230 }, { "epoch": 12.546367851622875, "grad_norm": 1.0437183380126953, "learning_rate": 1.833756374296384e-05, "loss": 0.5394, "num_input_tokens_seen": 5466464, "step": 16235 }, { "epoch": 12.550231839258114, "grad_norm": 0.6463442444801331, "learning_rate": 1.8321315157749635e-05, "loss": 0.456, "num_input_tokens_seen": 5468256, "step": 16240 }, { "epoch": 12.554095826893354, "grad_norm": 1.484784483909607, "learning_rate": 1.8305069610041298e-05, "loss": 0.5347, "num_input_tokens_seen": 5470048, "step": 16245 }, { "epoch": 12.557959814528594, "grad_norm": 0.7511551380157471, "learning_rate": 1.828882710722739e-05, "loss": 0.4781, "num_input_tokens_seen": 5471648, "step": 16250 }, { "epoch": 12.561823802163833, "grad_norm": 0.6835362315177917, "learning_rate": 1.8272587656695106e-05, "loss": 0.3918, "num_input_tokens_seen": 5473504, "step": 16255 }, { "epoch": 12.565687789799073, "grad_norm": 0.9419705867767334, "learning_rate": 1.8256351265830248e-05, "loss": 0.4271, "num_input_tokens_seen": 5475200, "step": 16260 }, { "epoch": 12.569551777434313, "grad_norm": 0.7645039558410645, "learning_rate": 1.8240117942017214e-05, "loss": 0.3975, "num_input_tokens_seen": 5476768, "step": 16265 }, { "epoch": 12.573415765069551, "grad_norm": 0.6855732798576355, "learning_rate": 1.8223887692639022e-05, "loss": 0.5666, "num_input_tokens_seen": 5478240, "step": 16270 }, { "epoch": 12.577279752704792, "grad_norm": 0.8989469408988953, "learning_rate": 1.82076605250773e-05, "loss": 0.3896, "num_input_tokens_seen": 5480128, "step": 16275 }, { "epoch": 12.58114374034003, "grad_norm": 1.0907318592071533, "learning_rate": 1.819143644671224e-05, "loss": 0.4607, "num_input_tokens_seen": 5481632, "step": 16280 }, { "epoch": 12.58500772797527, "grad_norm": 1.5172581672668457, "learning_rate": 1.8175215464922655e-05, "loss": 0.4802, "num_input_tokens_seen": 5483104, "step": 16285 }, { "epoch": 12.58887171561051, "grad_norm": 0.6445057988166809, "learning_rate": 1.815899758708596e-05, "loss": 0.4427, "num_input_tokens_seen": 5484480, "step": 16290 }, { "epoch": 12.592735703245749, "grad_norm": 1.1321065425872803, "learning_rate": 1.814278282057813e-05, "loss": 0.5188, "num_input_tokens_seen": 5486016, "step": 16295 }, { "epoch": 12.59659969088099, "grad_norm": 1.111307144165039, "learning_rate": 1.8126571172773733e-05, "loss": 0.4046, "num_input_tokens_seen": 5487744, "step": 16300 }, { "epoch": 12.60046367851623, "grad_norm": 0.8666423559188843, "learning_rate": 1.8110362651045933e-05, "loss": 0.3804, "num_input_tokens_seen": 5489344, "step": 16305 }, { "epoch": 12.604327666151468, "grad_norm": 1.165993571281433, "learning_rate": 1.8094157262766452e-05, "loss": 0.5337, "num_input_tokens_seen": 5490848, "step": 16310 }, { "epoch": 12.608191653786708, "grad_norm": 0.7072194218635559, "learning_rate": 1.8077955015305613e-05, "loss": 0.338, "num_input_tokens_seen": 5492672, "step": 16315 }, { "epoch": 12.612055641421948, "grad_norm": 0.7712292075157166, "learning_rate": 1.8061755916032286e-05, "loss": 0.4655, "num_input_tokens_seen": 5494496, "step": 16320 }, { "epoch": 12.615919629057187, "grad_norm": 0.6511956453323364, "learning_rate": 1.8045559972313925e-05, "loss": 0.5087, "num_input_tokens_seen": 5496128, "step": 16325 }, { "epoch": 12.619783616692427, "grad_norm": 1.2191543579101562, "learning_rate": 1.8029367191516535e-05, "loss": 0.4214, "num_input_tokens_seen": 5497824, "step": 16330 }, { "epoch": 12.623647604327665, "grad_norm": 2.0926687717437744, "learning_rate": 1.8013177581004685e-05, "loss": 0.6692, "num_input_tokens_seen": 5499392, "step": 16335 }, { "epoch": 12.627511591962906, "grad_norm": 0.6716405749320984, "learning_rate": 1.7996991148141522e-05, "loss": 0.337, "num_input_tokens_seen": 5500960, "step": 16340 }, { "epoch": 12.631375579598146, "grad_norm": 1.0199835300445557, "learning_rate": 1.7980807900288726e-05, "loss": 0.3923, "num_input_tokens_seen": 5502816, "step": 16345 }, { "epoch": 12.635239567233384, "grad_norm": 0.9031262993812561, "learning_rate": 1.796462784480652e-05, "loss": 0.3493, "num_input_tokens_seen": 5504672, "step": 16350 }, { "epoch": 12.639103554868624, "grad_norm": 1.335119366645813, "learning_rate": 1.7948450989053707e-05, "loss": 0.4029, "num_input_tokens_seen": 5506208, "step": 16355 }, { "epoch": 12.642967542503865, "grad_norm": 0.8271121382713318, "learning_rate": 1.793227734038762e-05, "loss": 0.4051, "num_input_tokens_seen": 5507776, "step": 16360 }, { "epoch": 12.646831530139103, "grad_norm": 0.6445081233978271, "learning_rate": 1.791610690616413e-05, "loss": 0.4002, "num_input_tokens_seen": 5509760, "step": 16365 }, { "epoch": 12.650695517774343, "grad_norm": 0.8285342454910278, "learning_rate": 1.7899939693737634e-05, "loss": 0.4377, "num_input_tokens_seen": 5511488, "step": 16370 }, { "epoch": 12.654559505409583, "grad_norm": 1.5477291345596313, "learning_rate": 1.7883775710461093e-05, "loss": 0.3739, "num_input_tokens_seen": 5512896, "step": 16375 }, { "epoch": 12.658423493044822, "grad_norm": 1.1056462526321411, "learning_rate": 1.7867614963685976e-05, "loss": 0.4346, "num_input_tokens_seen": 5514656, "step": 16380 }, { "epoch": 12.662287480680062, "grad_norm": 0.9550122618675232, "learning_rate": 1.7851457460762277e-05, "loss": 0.4237, "num_input_tokens_seen": 5516352, "step": 16385 }, { "epoch": 12.666151468315302, "grad_norm": 0.5256395936012268, "learning_rate": 1.7835303209038536e-05, "loss": 0.385, "num_input_tokens_seen": 5518208, "step": 16390 }, { "epoch": 12.67001545595054, "grad_norm": 1.3322283029556274, "learning_rate": 1.7819152215861812e-05, "loss": 0.5248, "num_input_tokens_seen": 5519840, "step": 16395 }, { "epoch": 12.673879443585781, "grad_norm": 0.6824471354484558, "learning_rate": 1.7803004488577667e-05, "loss": 0.4897, "num_input_tokens_seen": 5521600, "step": 16400 }, { "epoch": 12.67774343122102, "grad_norm": 0.7861367464065552, "learning_rate": 1.7786860034530174e-05, "loss": 0.4373, "num_input_tokens_seen": 5523328, "step": 16405 }, { "epoch": 12.68160741885626, "grad_norm": 1.0187410116195679, "learning_rate": 1.7770718861061942e-05, "loss": 0.4812, "num_input_tokens_seen": 5525056, "step": 16410 }, { "epoch": 12.6854714064915, "grad_norm": 0.9612721800804138, "learning_rate": 1.7754580975514062e-05, "loss": 0.5657, "num_input_tokens_seen": 5526592, "step": 16415 }, { "epoch": 12.689335394126738, "grad_norm": 1.052965760231018, "learning_rate": 1.7738446385226145e-05, "loss": 0.3568, "num_input_tokens_seen": 5528544, "step": 16420 }, { "epoch": 12.693199381761978, "grad_norm": 1.7081018686294556, "learning_rate": 1.7722315097536304e-05, "loss": 0.4804, "num_input_tokens_seen": 5530176, "step": 16425 }, { "epoch": 12.697063369397219, "grad_norm": 1.999492883682251, "learning_rate": 1.7706187119781132e-05, "loss": 0.6591, "num_input_tokens_seen": 5531840, "step": 16430 }, { "epoch": 12.700927357032457, "grad_norm": 1.2119868993759155, "learning_rate": 1.7690062459295746e-05, "loss": 0.5345, "num_input_tokens_seen": 5533280, "step": 16435 }, { "epoch": 12.704791344667697, "grad_norm": 1.0280240774154663, "learning_rate": 1.7673941123413726e-05, "loss": 0.3932, "num_input_tokens_seen": 5535072, "step": 16440 }, { "epoch": 12.708655332302937, "grad_norm": 0.5230708718299866, "learning_rate": 1.7657823119467165e-05, "loss": 0.4527, "num_input_tokens_seen": 5537120, "step": 16445 }, { "epoch": 12.712519319938176, "grad_norm": 1.6927002668380737, "learning_rate": 1.7641708454786615e-05, "loss": 0.4522, "num_input_tokens_seen": 5538688, "step": 16450 }, { "epoch": 12.716383307573416, "grad_norm": 1.3639222383499146, "learning_rate": 1.7625597136701127e-05, "loss": 0.4422, "num_input_tokens_seen": 5540448, "step": 16455 }, { "epoch": 12.720247295208654, "grad_norm": 1.3351000547409058, "learning_rate": 1.760948917253823e-05, "loss": 0.5259, "num_input_tokens_seen": 5542272, "step": 16460 }, { "epoch": 12.724111282843895, "grad_norm": 0.8136488199234009, "learning_rate": 1.7593384569623914e-05, "loss": 0.3733, "num_input_tokens_seen": 5543936, "step": 16465 }, { "epoch": 12.727975270479135, "grad_norm": 1.264346718788147, "learning_rate": 1.757728333528264e-05, "loss": 0.4472, "num_input_tokens_seen": 5545568, "step": 16470 }, { "epoch": 12.731839258114373, "grad_norm": 1.3721922636032104, "learning_rate": 1.756118547683737e-05, "loss": 0.4714, "num_input_tokens_seen": 5547104, "step": 16475 }, { "epoch": 12.735703245749614, "grad_norm": 1.651692271232605, "learning_rate": 1.7545091001609496e-05, "loss": 0.4516, "num_input_tokens_seen": 5548672, "step": 16480 }, { "epoch": 12.739567233384854, "grad_norm": 0.7961933016777039, "learning_rate": 1.752899991691888e-05, "loss": 0.4362, "num_input_tokens_seen": 5550400, "step": 16485 }, { "epoch": 12.743431221020092, "grad_norm": 1.3075714111328125, "learning_rate": 1.7512912230083838e-05, "loss": 0.4818, "num_input_tokens_seen": 5552096, "step": 16490 }, { "epoch": 12.747295208655332, "grad_norm": 0.9906137585639954, "learning_rate": 1.7496827948421157e-05, "loss": 0.375, "num_input_tokens_seen": 5553792, "step": 16495 }, { "epoch": 12.751159196290573, "grad_norm": 1.3424949645996094, "learning_rate": 1.7480747079246063e-05, "loss": 0.3508, "num_input_tokens_seen": 5555424, "step": 16500 }, { "epoch": 12.755023183925811, "grad_norm": 0.9362058639526367, "learning_rate": 1.746466962987222e-05, "loss": 0.3938, "num_input_tokens_seen": 5557120, "step": 16505 }, { "epoch": 12.758887171561051, "grad_norm": 0.7324134111404419, "learning_rate": 1.7448595607611753e-05, "loss": 0.397, "num_input_tokens_seen": 5558656, "step": 16510 }, { "epoch": 12.762751159196291, "grad_norm": 0.9228474497795105, "learning_rate": 1.7432525019775236e-05, "loss": 0.325, "num_input_tokens_seen": 5560288, "step": 16515 }, { "epoch": 12.76661514683153, "grad_norm": 1.2694276571273804, "learning_rate": 1.7416457873671663e-05, "loss": 0.4601, "num_input_tokens_seen": 5562144, "step": 16520 }, { "epoch": 12.77047913446677, "grad_norm": 1.314159870147705, "learning_rate": 1.7400394176608457e-05, "loss": 0.4136, "num_input_tokens_seen": 5563840, "step": 16525 }, { "epoch": 12.774343122102009, "grad_norm": 1.3325920104980469, "learning_rate": 1.73843339358915e-05, "loss": 0.4037, "num_input_tokens_seen": 5565600, "step": 16530 }, { "epoch": 12.778207109737249, "grad_norm": 0.6491349935531616, "learning_rate": 1.7368277158825076e-05, "loss": 0.3991, "num_input_tokens_seen": 5567360, "step": 16535 }, { "epoch": 12.782071097372489, "grad_norm": 1.3045471906661987, "learning_rate": 1.7352223852711896e-05, "loss": 0.6453, "num_input_tokens_seen": 5568800, "step": 16540 }, { "epoch": 12.785935085007727, "grad_norm": 0.8813719749450684, "learning_rate": 1.733617402485312e-05, "loss": 0.4368, "num_input_tokens_seen": 5570240, "step": 16545 }, { "epoch": 12.789799072642968, "grad_norm": 1.2809761762619019, "learning_rate": 1.7320127682548277e-05, "loss": 0.4176, "num_input_tokens_seen": 5571552, "step": 16550 }, { "epoch": 12.793663060278208, "grad_norm": 1.1456822156906128, "learning_rate": 1.730408483309537e-05, "loss": 0.4025, "num_input_tokens_seen": 5573184, "step": 16555 }, { "epoch": 12.797527047913446, "grad_norm": 1.205998182296753, "learning_rate": 1.7288045483790766e-05, "loss": 0.4864, "num_input_tokens_seen": 5574880, "step": 16560 }, { "epoch": 12.801391035548686, "grad_norm": 0.7055379152297974, "learning_rate": 1.7272009641929267e-05, "loss": 0.3904, "num_input_tokens_seen": 5576608, "step": 16565 }, { "epoch": 12.805255023183927, "grad_norm": 1.6406978368759155, "learning_rate": 1.7255977314804063e-05, "loss": 0.4701, "num_input_tokens_seen": 5578368, "step": 16570 }, { "epoch": 12.809119010819165, "grad_norm": 0.8987976908683777, "learning_rate": 1.723994850970675e-05, "loss": 0.4222, "num_input_tokens_seen": 5580320, "step": 16575 }, { "epoch": 12.812982998454405, "grad_norm": 0.7055689692497253, "learning_rate": 1.722392323392733e-05, "loss": 0.4053, "num_input_tokens_seen": 5581728, "step": 16580 }, { "epoch": 12.816846986089644, "grad_norm": 0.9751905798912048, "learning_rate": 1.7207901494754192e-05, "loss": 0.402, "num_input_tokens_seen": 5583200, "step": 16585 }, { "epoch": 12.820710973724884, "grad_norm": 0.5211448669433594, "learning_rate": 1.719188329947411e-05, "loss": 0.3201, "num_input_tokens_seen": 5584992, "step": 16590 }, { "epoch": 12.824574961360124, "grad_norm": 1.1720983982086182, "learning_rate": 1.717586865537227e-05, "loss": 0.5469, "num_input_tokens_seen": 5586592, "step": 16595 }, { "epoch": 12.828438948995363, "grad_norm": 1.1320469379425049, "learning_rate": 1.715985756973223e-05, "loss": 0.3982, "num_input_tokens_seen": 5588544, "step": 16600 }, { "epoch": 12.832302936630603, "grad_norm": 0.762525737285614, "learning_rate": 1.7143850049835915e-05, "loss": 0.4504, "num_input_tokens_seen": 5590144, "step": 16605 }, { "epoch": 12.836166924265843, "grad_norm": 1.2484321594238281, "learning_rate": 1.7127846102963646e-05, "loss": 0.4116, "num_input_tokens_seen": 5591680, "step": 16610 }, { "epoch": 12.840030911901081, "grad_norm": 0.7451475858688354, "learning_rate": 1.7111845736394118e-05, "loss": 0.3345, "num_input_tokens_seen": 5593408, "step": 16615 }, { "epoch": 12.843894899536322, "grad_norm": 0.9440672397613525, "learning_rate": 1.7095848957404384e-05, "loss": 0.4123, "num_input_tokens_seen": 5595072, "step": 16620 }, { "epoch": 12.847758887171562, "grad_norm": 0.6632667183876038, "learning_rate": 1.707985577326988e-05, "loss": 0.3785, "num_input_tokens_seen": 5596704, "step": 16625 }, { "epoch": 12.8516228748068, "grad_norm": 0.9827655553817749, "learning_rate": 1.7063866191264398e-05, "loss": 0.8376, "num_input_tokens_seen": 5598336, "step": 16630 }, { "epoch": 12.85548686244204, "grad_norm": 0.8807010650634766, "learning_rate": 1.7047880218660107e-05, "loss": 0.4205, "num_input_tokens_seen": 5599936, "step": 16635 }, { "epoch": 12.85935085007728, "grad_norm": 1.2883942127227783, "learning_rate": 1.7031897862727513e-05, "loss": 0.4135, "num_input_tokens_seen": 5601696, "step": 16640 }, { "epoch": 12.863214837712519, "grad_norm": 0.907592236995697, "learning_rate": 1.7015919130735493e-05, "loss": 0.4004, "num_input_tokens_seen": 5603552, "step": 16645 }, { "epoch": 12.86707882534776, "grad_norm": 1.0294463634490967, "learning_rate": 1.6999944029951265e-05, "loss": 0.4355, "num_input_tokens_seen": 5605056, "step": 16650 }, { "epoch": 12.870942812982998, "grad_norm": 1.1462523937225342, "learning_rate": 1.698397256764041e-05, "loss": 0.5544, "num_input_tokens_seen": 5607104, "step": 16655 }, { "epoch": 12.874806800618238, "grad_norm": 1.1303378343582153, "learning_rate": 1.6968004751066823e-05, "loss": 0.5772, "num_input_tokens_seen": 5608672, "step": 16660 }, { "epoch": 12.878670788253478, "grad_norm": 1.451472282409668, "learning_rate": 1.695204058749279e-05, "loss": 0.499, "num_input_tokens_seen": 5610240, "step": 16665 }, { "epoch": 12.882534775888717, "grad_norm": 0.9680337905883789, "learning_rate": 1.693608008417888e-05, "loss": 0.5334, "num_input_tokens_seen": 5612128, "step": 16670 }, { "epoch": 12.886398763523957, "grad_norm": 1.5814921855926514, "learning_rate": 1.6920123248384054e-05, "loss": 0.5146, "num_input_tokens_seen": 5613824, "step": 16675 }, { "epoch": 12.890262751159197, "grad_norm": 1.581734538078308, "learning_rate": 1.690417008736556e-05, "loss": 0.6466, "num_input_tokens_seen": 5615552, "step": 16680 }, { "epoch": 12.894126738794435, "grad_norm": 1.1179765462875366, "learning_rate": 1.6888220608378992e-05, "loss": 0.458, "num_input_tokens_seen": 5617184, "step": 16685 }, { "epoch": 12.897990726429676, "grad_norm": 1.035158395767212, "learning_rate": 1.6872274818678275e-05, "loss": 0.3448, "num_input_tokens_seen": 5618784, "step": 16690 }, { "epoch": 12.901854714064916, "grad_norm": 0.830230712890625, "learning_rate": 1.6856332725515643e-05, "loss": 0.5063, "num_input_tokens_seen": 5620736, "step": 16695 }, { "epoch": 12.905718701700154, "grad_norm": 0.9581377506256104, "learning_rate": 1.684039433614166e-05, "loss": 0.3758, "num_input_tokens_seen": 5622176, "step": 16700 }, { "epoch": 12.909582689335394, "grad_norm": 0.7681434154510498, "learning_rate": 1.68244596578052e-05, "loss": 0.4493, "num_input_tokens_seen": 5623840, "step": 16705 }, { "epoch": 12.913446676970633, "grad_norm": 0.7929017543792725, "learning_rate": 1.680852869775344e-05, "loss": 0.4453, "num_input_tokens_seen": 5625760, "step": 16710 }, { "epoch": 12.917310664605873, "grad_norm": 0.37542077898979187, "learning_rate": 1.6792601463231892e-05, "loss": 0.6306, "num_input_tokens_seen": 5627648, "step": 16715 }, { "epoch": 12.921174652241113, "grad_norm": 0.9401633739471436, "learning_rate": 1.6776677961484346e-05, "loss": 0.3771, "num_input_tokens_seen": 5629088, "step": 16720 }, { "epoch": 12.925038639876352, "grad_norm": 0.7330148220062256, "learning_rate": 1.676075819975292e-05, "loss": 0.5185, "num_input_tokens_seen": 5630560, "step": 16725 }, { "epoch": 12.928902627511592, "grad_norm": 1.0190457105636597, "learning_rate": 1.6744842185278002e-05, "loss": 0.5017, "num_input_tokens_seen": 5632000, "step": 16730 }, { "epoch": 12.932766615146832, "grad_norm": 0.8226780295372009, "learning_rate": 1.672892992529829e-05, "loss": 0.4277, "num_input_tokens_seen": 5633568, "step": 16735 }, { "epoch": 12.93663060278207, "grad_norm": 1.5745460987091064, "learning_rate": 1.6713021427050795e-05, "loss": 0.5419, "num_input_tokens_seen": 5635328, "step": 16740 }, { "epoch": 12.94049459041731, "grad_norm": 0.9084721803665161, "learning_rate": 1.6697116697770773e-05, "loss": 0.369, "num_input_tokens_seen": 5636800, "step": 16745 }, { "epoch": 12.944358578052551, "grad_norm": 1.0261540412902832, "learning_rate": 1.6681215744691804e-05, "loss": 0.4199, "num_input_tokens_seen": 5638528, "step": 16750 }, { "epoch": 12.94822256568779, "grad_norm": 1.1372257471084595, "learning_rate": 1.666531857504573e-05, "loss": 0.4185, "num_input_tokens_seen": 5640192, "step": 16755 }, { "epoch": 12.95208655332303, "grad_norm": 0.9742651581764221, "learning_rate": 1.664942519606269e-05, "loss": 0.3809, "num_input_tokens_seen": 5641856, "step": 16760 }, { "epoch": 12.95595054095827, "grad_norm": 1.2406384944915771, "learning_rate": 1.6633535614971078e-05, "loss": 0.4022, "num_input_tokens_seen": 5643360, "step": 16765 }, { "epoch": 12.959814528593508, "grad_norm": 0.5054617524147034, "learning_rate": 1.661764983899757e-05, "loss": 0.5571, "num_input_tokens_seen": 5645152, "step": 16770 }, { "epoch": 12.963678516228748, "grad_norm": 0.9310892224311829, "learning_rate": 1.6601767875367118e-05, "loss": 0.5011, "num_input_tokens_seen": 5646976, "step": 16775 }, { "epoch": 12.967542503863987, "grad_norm": 0.7565891742706299, "learning_rate": 1.6585889731302934e-05, "loss": 0.3416, "num_input_tokens_seen": 5648768, "step": 16780 }, { "epoch": 12.971406491499227, "grad_norm": 1.0652869939804077, "learning_rate": 1.6570015414026486e-05, "loss": 0.7835, "num_input_tokens_seen": 5650624, "step": 16785 }, { "epoch": 12.975270479134467, "grad_norm": 0.7398920059204102, "learning_rate": 1.6554144930757504e-05, "loss": 0.5357, "num_input_tokens_seen": 5652384, "step": 16790 }, { "epoch": 12.979134466769706, "grad_norm": 0.9309130311012268, "learning_rate": 1.6538278288714003e-05, "loss": 0.5411, "num_input_tokens_seen": 5653952, "step": 16795 }, { "epoch": 12.982998454404946, "grad_norm": 1.3577675819396973, "learning_rate": 1.652241549511221e-05, "loss": 0.4817, "num_input_tokens_seen": 5655744, "step": 16800 }, { "epoch": 12.986862442040186, "grad_norm": 0.8704668283462524, "learning_rate": 1.650655655716661e-05, "loss": 0.4371, "num_input_tokens_seen": 5657344, "step": 16805 }, { "epoch": 12.990726429675425, "grad_norm": 1.4947055578231812, "learning_rate": 1.649070148208996e-05, "loss": 0.392, "num_input_tokens_seen": 5658848, "step": 16810 }, { "epoch": 12.994590417310665, "grad_norm": 0.882703959941864, "learning_rate": 1.647485027709324e-05, "loss": 0.4907, "num_input_tokens_seen": 5660672, "step": 16815 }, { "epoch": 12.998454404945905, "grad_norm": 1.0624860525131226, "learning_rate": 1.6459002949385662e-05, "loss": 0.4453, "num_input_tokens_seen": 5662272, "step": 16820 }, { "epoch": 13.0, "eval_loss": 0.46614736318588257, "eval_runtime": 6.3604, "eval_samples_per_second": 90.403, "eval_steps_per_second": 22.64, "num_input_tokens_seen": 5662848, "step": 16822 }, { "epoch": 13.002318392581143, "grad_norm": 1.106284260749817, "learning_rate": 1.64431595061747e-05, "loss": 0.4627, "num_input_tokens_seen": 5663808, "step": 16825 }, { "epoch": 13.006182380216384, "grad_norm": 0.8885911107063293, "learning_rate": 1.6427319954666027e-05, "loss": 0.5773, "num_input_tokens_seen": 5665504, "step": 16830 }, { "epoch": 13.010046367851622, "grad_norm": 0.9209770560264587, "learning_rate": 1.6411484302063587e-05, "loss": 0.3989, "num_input_tokens_seen": 5667264, "step": 16835 }, { "epoch": 13.013910355486862, "grad_norm": 2.0346453189849854, "learning_rate": 1.6395652555569518e-05, "loss": 0.6113, "num_input_tokens_seen": 5668768, "step": 16840 }, { "epoch": 13.017774343122102, "grad_norm": 0.7640185952186584, "learning_rate": 1.6379824722384203e-05, "loss": 0.4259, "num_input_tokens_seen": 5670464, "step": 16845 }, { "epoch": 13.021638330757341, "grad_norm": 0.6768211722373962, "learning_rate": 1.6364000809706222e-05, "loss": 0.3838, "num_input_tokens_seen": 5672064, "step": 16850 }, { "epoch": 13.025502318392581, "grad_norm": 0.8654024600982666, "learning_rate": 1.634818082473239e-05, "loss": 0.5102, "num_input_tokens_seen": 5674048, "step": 16855 }, { "epoch": 13.029366306027821, "grad_norm": 1.3188883066177368, "learning_rate": 1.633236477465774e-05, "loss": 0.4219, "num_input_tokens_seen": 5675648, "step": 16860 }, { "epoch": 13.03323029366306, "grad_norm": 1.161341905593872, "learning_rate": 1.63165526666755e-05, "loss": 0.4651, "num_input_tokens_seen": 5677248, "step": 16865 }, { "epoch": 13.0370942812983, "grad_norm": 1.1347768306732178, "learning_rate": 1.6300744507977095e-05, "loss": 0.3568, "num_input_tokens_seen": 5679008, "step": 16870 }, { "epoch": 13.04095826893354, "grad_norm": 1.2300152778625488, "learning_rate": 1.6284940305752195e-05, "loss": 0.6486, "num_input_tokens_seen": 5681088, "step": 16875 }, { "epoch": 13.044822256568779, "grad_norm": 1.1198538541793823, "learning_rate": 1.6269140067188638e-05, "loss": 0.482, "num_input_tokens_seen": 5682848, "step": 16880 }, { "epoch": 13.048686244204019, "grad_norm": 1.2357186079025269, "learning_rate": 1.6253343799472467e-05, "loss": 0.7548, "num_input_tokens_seen": 5684544, "step": 16885 }, { "epoch": 13.052550231839259, "grad_norm": 0.8051734566688538, "learning_rate": 1.6237551509787912e-05, "loss": 0.3904, "num_input_tokens_seen": 5686272, "step": 16890 }, { "epoch": 13.056414219474497, "grad_norm": 1.693877100944519, "learning_rate": 1.6221763205317415e-05, "loss": 0.6758, "num_input_tokens_seen": 5687968, "step": 16895 }, { "epoch": 13.060278207109738, "grad_norm": 1.1159847974777222, "learning_rate": 1.620597889324158e-05, "loss": 0.5193, "num_input_tokens_seen": 5689568, "step": 16900 }, { "epoch": 13.064142194744976, "grad_norm": 1.3683799505233765, "learning_rate": 1.6190198580739206e-05, "loss": 0.4596, "num_input_tokens_seen": 5691296, "step": 16905 }, { "epoch": 13.068006182380216, "grad_norm": 1.9785348176956177, "learning_rate": 1.617442227498727e-05, "loss": 0.5819, "num_input_tokens_seen": 5693056, "step": 16910 }, { "epoch": 13.071870170015456, "grad_norm": 1.13345468044281, "learning_rate": 1.615864998316095e-05, "loss": 0.5581, "num_input_tokens_seen": 5694848, "step": 16915 }, { "epoch": 13.075734157650695, "grad_norm": 1.4341057538986206, "learning_rate": 1.6142881712433566e-05, "loss": 0.7931, "num_input_tokens_seen": 5696768, "step": 16920 }, { "epoch": 13.079598145285935, "grad_norm": 0.7289278507232666, "learning_rate": 1.6127117469976617e-05, "loss": 0.3468, "num_input_tokens_seen": 5698816, "step": 16925 }, { "epoch": 13.083462132921175, "grad_norm": 1.0520963668823242, "learning_rate": 1.6111357262959785e-05, "loss": 0.4833, "num_input_tokens_seen": 5700352, "step": 16930 }, { "epoch": 13.087326120556414, "grad_norm": 0.708512008190155, "learning_rate": 1.60956010985509e-05, "loss": 0.5149, "num_input_tokens_seen": 5702208, "step": 16935 }, { "epoch": 13.091190108191654, "grad_norm": 1.4753082990646362, "learning_rate": 1.607984898391596e-05, "loss": 0.4017, "num_input_tokens_seen": 5704000, "step": 16940 }, { "epoch": 13.095054095826894, "grad_norm": 1.0262582302093506, "learning_rate": 1.6064100926219128e-05, "loss": 0.4334, "num_input_tokens_seen": 5705888, "step": 16945 }, { "epoch": 13.098918083462133, "grad_norm": 0.8522509932518005, "learning_rate": 1.6048356932622696e-05, "loss": 0.4313, "num_input_tokens_seen": 5707552, "step": 16950 }, { "epoch": 13.102782071097373, "grad_norm": 1.6042368412017822, "learning_rate": 1.6032617010287154e-05, "loss": 0.4312, "num_input_tokens_seen": 5709248, "step": 16955 }, { "epoch": 13.106646058732611, "grad_norm": 0.9790276885032654, "learning_rate": 1.601688116637109e-05, "loss": 0.4636, "num_input_tokens_seen": 5711040, "step": 16960 }, { "epoch": 13.110510046367851, "grad_norm": 1.0974615812301636, "learning_rate": 1.600114940803128e-05, "loss": 0.4572, "num_input_tokens_seen": 5712896, "step": 16965 }, { "epoch": 13.114374034003092, "grad_norm": 0.6567896008491516, "learning_rate": 1.5985421742422608e-05, "loss": 0.4243, "num_input_tokens_seen": 5714528, "step": 16970 }, { "epoch": 13.11823802163833, "grad_norm": 1.299623966217041, "learning_rate": 1.596969817669811e-05, "loss": 0.3902, "num_input_tokens_seen": 5716352, "step": 16975 }, { "epoch": 13.12210200927357, "grad_norm": 0.6979085206985474, "learning_rate": 1.5953978718008965e-05, "loss": 0.3901, "num_input_tokens_seen": 5717952, "step": 16980 }, { "epoch": 13.12596599690881, "grad_norm": 0.8482693433761597, "learning_rate": 1.5938263373504475e-05, "loss": 0.3793, "num_input_tokens_seen": 5719808, "step": 16985 }, { "epoch": 13.129829984544049, "grad_norm": 0.7148630023002625, "learning_rate": 1.592255215033206e-05, "loss": 0.4228, "num_input_tokens_seen": 5721280, "step": 16990 }, { "epoch": 13.13369397217929, "grad_norm": 0.730514407157898, "learning_rate": 1.5906845055637293e-05, "loss": 0.4759, "num_input_tokens_seen": 5722912, "step": 16995 }, { "epoch": 13.13755795981453, "grad_norm": 0.6822680830955505, "learning_rate": 1.589114209656386e-05, "loss": 0.3983, "num_input_tokens_seen": 5724416, "step": 17000 }, { "epoch": 13.141421947449768, "grad_norm": 0.5892531871795654, "learning_rate": 1.587544328025355e-05, "loss": 0.4108, "num_input_tokens_seen": 5726048, "step": 17005 }, { "epoch": 13.145285935085008, "grad_norm": 0.8598360419273376, "learning_rate": 1.585974861384628e-05, "loss": 0.4446, "num_input_tokens_seen": 5727712, "step": 17010 }, { "epoch": 13.149149922720248, "grad_norm": 1.1037017107009888, "learning_rate": 1.5844058104480082e-05, "loss": 0.3879, "num_input_tokens_seen": 5729376, "step": 17015 }, { "epoch": 13.153013910355487, "grad_norm": 0.9557453989982605, "learning_rate": 1.5828371759291088e-05, "loss": 0.3659, "num_input_tokens_seen": 5731008, "step": 17020 }, { "epoch": 13.156877897990727, "grad_norm": 1.552025556564331, "learning_rate": 1.5812689585413542e-05, "loss": 0.366, "num_input_tokens_seen": 5732544, "step": 17025 }, { "epoch": 13.160741885625965, "grad_norm": 1.1141029596328735, "learning_rate": 1.5797011589979788e-05, "loss": 0.3844, "num_input_tokens_seen": 5734016, "step": 17030 }, { "epoch": 13.164605873261205, "grad_norm": 1.0040457248687744, "learning_rate": 1.5781337780120287e-05, "loss": 0.4112, "num_input_tokens_seen": 5735488, "step": 17035 }, { "epoch": 13.168469860896446, "grad_norm": 0.7835976481437683, "learning_rate": 1.5765668162963572e-05, "loss": 0.3763, "num_input_tokens_seen": 5737216, "step": 17040 }, { "epoch": 13.172333848531684, "grad_norm": 0.582658588886261, "learning_rate": 1.5750002745636275e-05, "loss": 0.3463, "num_input_tokens_seen": 5738656, "step": 17045 }, { "epoch": 13.176197836166924, "grad_norm": 1.2109501361846924, "learning_rate": 1.573434153526313e-05, "loss": 0.4107, "num_input_tokens_seen": 5740224, "step": 17050 }, { "epoch": 13.180061823802165, "grad_norm": 0.8336194157600403, "learning_rate": 1.5718684538966944e-05, "loss": 0.4336, "num_input_tokens_seen": 5741792, "step": 17055 }, { "epoch": 13.183925811437403, "grad_norm": 2.0999608039855957, "learning_rate": 1.570303176386861e-05, "loss": 0.8736, "num_input_tokens_seen": 5743488, "step": 17060 }, { "epoch": 13.187789799072643, "grad_norm": 0.9405179023742676, "learning_rate": 1.568738321708711e-05, "loss": 0.3517, "num_input_tokens_seen": 5745024, "step": 17065 }, { "epoch": 13.191653786707883, "grad_norm": 0.886135458946228, "learning_rate": 1.567173890573949e-05, "loss": 0.3959, "num_input_tokens_seen": 5746848, "step": 17070 }, { "epoch": 13.195517774343122, "grad_norm": 1.1435012817382812, "learning_rate": 1.5656098836940877e-05, "loss": 0.3892, "num_input_tokens_seen": 5748416, "step": 17075 }, { "epoch": 13.199381761978362, "grad_norm": 1.9332623481750488, "learning_rate": 1.5640463017804476e-05, "loss": 0.4909, "num_input_tokens_seen": 5750208, "step": 17080 }, { "epoch": 13.2032457496136, "grad_norm": 0.6912783980369568, "learning_rate": 1.562483145544155e-05, "loss": 0.3827, "num_input_tokens_seen": 5751936, "step": 17085 }, { "epoch": 13.20710973724884, "grad_norm": 0.7428233027458191, "learning_rate": 1.560920415696142e-05, "loss": 0.4233, "num_input_tokens_seen": 5753600, "step": 17090 }, { "epoch": 13.21097372488408, "grad_norm": 0.8769023418426514, "learning_rate": 1.559358112947148e-05, "loss": 0.5055, "num_input_tokens_seen": 5755552, "step": 17095 }, { "epoch": 13.21483771251932, "grad_norm": 0.8310760855674744, "learning_rate": 1.5577962380077177e-05, "loss": 0.4178, "num_input_tokens_seen": 5757312, "step": 17100 }, { "epoch": 13.21870170015456, "grad_norm": 1.0098016262054443, "learning_rate": 1.556234791588201e-05, "loss": 0.4138, "num_input_tokens_seen": 5758816, "step": 17105 }, { "epoch": 13.2225656877898, "grad_norm": 0.8614693284034729, "learning_rate": 1.5546737743987526e-05, "loss": 0.4372, "num_input_tokens_seen": 5760384, "step": 17110 }, { "epoch": 13.226429675425038, "grad_norm": 1.322795033454895, "learning_rate": 1.5531131871493327e-05, "loss": 0.3937, "num_input_tokens_seen": 5761952, "step": 17115 }, { "epoch": 13.230293663060278, "grad_norm": 0.9843089580535889, "learning_rate": 1.5515530305497065e-05, "loss": 0.5263, "num_input_tokens_seen": 5763616, "step": 17120 }, { "epoch": 13.234157650695519, "grad_norm": 0.7313054203987122, "learning_rate": 1.5499933053094425e-05, "loss": 0.4147, "num_input_tokens_seen": 5765728, "step": 17125 }, { "epoch": 13.238021638330757, "grad_norm": 1.1220581531524658, "learning_rate": 1.5484340121379116e-05, "loss": 0.3963, "num_input_tokens_seen": 5767424, "step": 17130 }, { "epoch": 13.241885625965997, "grad_norm": 1.4027061462402344, "learning_rate": 1.5468751517442913e-05, "loss": 0.3633, "num_input_tokens_seen": 5769344, "step": 17135 }, { "epoch": 13.245749613601237, "grad_norm": 0.6619519591331482, "learning_rate": 1.5453167248375606e-05, "loss": 0.3307, "num_input_tokens_seen": 5770976, "step": 17140 }, { "epoch": 13.249613601236476, "grad_norm": 0.8944388031959534, "learning_rate": 1.5437587321264995e-05, "loss": 0.3829, "num_input_tokens_seen": 5772672, "step": 17145 }, { "epoch": 13.253477588871716, "grad_norm": 0.6774348020553589, "learning_rate": 1.542201174319695e-05, "loss": 0.3766, "num_input_tokens_seen": 5774496, "step": 17150 }, { "epoch": 13.257341576506954, "grad_norm": 0.8647093772888184, "learning_rate": 1.5406440521255312e-05, "loss": 0.3864, "num_input_tokens_seen": 5776192, "step": 17155 }, { "epoch": 13.261205564142195, "grad_norm": 0.7133273482322693, "learning_rate": 1.5390873662521983e-05, "loss": 0.3391, "num_input_tokens_seen": 5777792, "step": 17160 }, { "epoch": 13.265069551777435, "grad_norm": 0.8550540208816528, "learning_rate": 1.5375311174076863e-05, "loss": 0.3769, "num_input_tokens_seen": 5779680, "step": 17165 }, { "epoch": 13.268933539412673, "grad_norm": 0.8879263401031494, "learning_rate": 1.5359753062997858e-05, "loss": 0.475, "num_input_tokens_seen": 5781312, "step": 17170 }, { "epoch": 13.272797527047913, "grad_norm": 0.7897288203239441, "learning_rate": 1.53441993363609e-05, "loss": 0.3846, "num_input_tokens_seen": 5782848, "step": 17175 }, { "epoch": 13.276661514683154, "grad_norm": 0.8791887164115906, "learning_rate": 1.5328650001239898e-05, "loss": 0.4512, "num_input_tokens_seen": 5784832, "step": 17180 }, { "epoch": 13.280525502318392, "grad_norm": 0.8673766851425171, "learning_rate": 1.5313105064706803e-05, "loss": 0.4309, "num_input_tokens_seen": 5786624, "step": 17185 }, { "epoch": 13.284389489953632, "grad_norm": 0.6558999419212341, "learning_rate": 1.5297564533831536e-05, "loss": 0.4363, "num_input_tokens_seen": 5788160, "step": 17190 }, { "epoch": 13.288253477588873, "grad_norm": 1.2176003456115723, "learning_rate": 1.528202841568202e-05, "loss": 0.4079, "num_input_tokens_seen": 5789696, "step": 17195 }, { "epoch": 13.292117465224111, "grad_norm": 0.8696337938308716, "learning_rate": 1.526649671732418e-05, "loss": 0.3935, "num_input_tokens_seen": 5791296, "step": 17200 }, { "epoch": 13.295981452859351, "grad_norm": 1.0326435565948486, "learning_rate": 1.5250969445821928e-05, "loss": 0.4446, "num_input_tokens_seen": 5792928, "step": 17205 }, { "epoch": 13.29984544049459, "grad_norm": 1.4523934125900269, "learning_rate": 1.523544660823716e-05, "loss": 0.5017, "num_input_tokens_seen": 5794400, "step": 17210 }, { "epoch": 13.30370942812983, "grad_norm": 1.4469550848007202, "learning_rate": 1.5219928211629747e-05, "loss": 0.392, "num_input_tokens_seen": 5796256, "step": 17215 }, { "epoch": 13.30757341576507, "grad_norm": 0.7061772346496582, "learning_rate": 1.5204414263057559e-05, "loss": 0.3486, "num_input_tokens_seen": 5797888, "step": 17220 }, { "epoch": 13.311437403400308, "grad_norm": 1.0667858123779297, "learning_rate": 1.5188904769576423e-05, "loss": 0.3496, "num_input_tokens_seen": 5799680, "step": 17225 }, { "epoch": 13.315301391035549, "grad_norm": 0.9085558652877808, "learning_rate": 1.5173399738240154e-05, "loss": 0.6566, "num_input_tokens_seen": 5801536, "step": 17230 }, { "epoch": 13.319165378670789, "grad_norm": 1.0813021659851074, "learning_rate": 1.5157899176100526e-05, "loss": 0.3582, "num_input_tokens_seen": 5803136, "step": 17235 }, { "epoch": 13.323029366306027, "grad_norm": 0.8576024770736694, "learning_rate": 1.5142403090207307e-05, "loss": 0.3462, "num_input_tokens_seen": 5804704, "step": 17240 }, { "epoch": 13.326893353941268, "grad_norm": 0.810297429561615, "learning_rate": 1.5126911487608198e-05, "loss": 0.4074, "num_input_tokens_seen": 5806720, "step": 17245 }, { "epoch": 13.330757341576508, "grad_norm": 0.6286483407020569, "learning_rate": 1.5111424375348866e-05, "loss": 0.3474, "num_input_tokens_seen": 5808224, "step": 17250 }, { "epoch": 13.334621329211746, "grad_norm": 2.5678482055664062, "learning_rate": 1.5095941760472947e-05, "loss": 0.7205, "num_input_tokens_seen": 5810144, "step": 17255 }, { "epoch": 13.338485316846986, "grad_norm": 1.0623186826705933, "learning_rate": 1.5080463650022036e-05, "loss": 0.5153, "num_input_tokens_seen": 5811904, "step": 17260 }, { "epoch": 13.342349304482227, "grad_norm": 0.5807892084121704, "learning_rate": 1.5064990051035654e-05, "loss": 0.3678, "num_input_tokens_seen": 5813632, "step": 17265 }, { "epoch": 13.346213292117465, "grad_norm": 0.8142113089561462, "learning_rate": 1.5049520970551307e-05, "loss": 0.4955, "num_input_tokens_seen": 5815456, "step": 17270 }, { "epoch": 13.350077279752705, "grad_norm": 0.9465206265449524, "learning_rate": 1.5034056415604397e-05, "loss": 0.4059, "num_input_tokens_seen": 5817056, "step": 17275 }, { "epoch": 13.353941267387944, "grad_norm": 1.1056160926818848, "learning_rate": 1.5018596393228323e-05, "loss": 0.5137, "num_input_tokens_seen": 5818656, "step": 17280 }, { "epoch": 13.357805255023184, "grad_norm": 1.3500885963439941, "learning_rate": 1.5003140910454383e-05, "loss": 0.5075, "num_input_tokens_seen": 5820576, "step": 17285 }, { "epoch": 13.361669242658424, "grad_norm": 1.9650789499282837, "learning_rate": 1.498768997431183e-05, "loss": 0.6029, "num_input_tokens_seen": 5822016, "step": 17290 }, { "epoch": 13.365533230293662, "grad_norm": 0.731957197189331, "learning_rate": 1.4972243591827839e-05, "loss": 0.333, "num_input_tokens_seen": 5823904, "step": 17295 }, { "epoch": 13.369397217928903, "grad_norm": 0.608772873878479, "learning_rate": 1.495680177002751e-05, "loss": 0.5708, "num_input_tokens_seen": 5825600, "step": 17300 }, { "epoch": 13.373261205564143, "grad_norm": 0.6083573698997498, "learning_rate": 1.4941364515933886e-05, "loss": 0.3908, "num_input_tokens_seen": 5827456, "step": 17305 }, { "epoch": 13.377125193199381, "grad_norm": 1.0106128454208374, "learning_rate": 1.4925931836567922e-05, "loss": 0.4767, "num_input_tokens_seen": 5829312, "step": 17310 }, { "epoch": 13.380989180834622, "grad_norm": 1.3535799980163574, "learning_rate": 1.4910503738948477e-05, "loss": 0.4331, "num_input_tokens_seen": 5831040, "step": 17315 }, { "epoch": 13.384853168469862, "grad_norm": 1.231218934059143, "learning_rate": 1.4895080230092363e-05, "loss": 0.4056, "num_input_tokens_seen": 5832640, "step": 17320 }, { "epoch": 13.3887171561051, "grad_norm": 1.6127263307571411, "learning_rate": 1.4879661317014279e-05, "loss": 0.4019, "num_input_tokens_seen": 5834368, "step": 17325 }, { "epoch": 13.39258114374034, "grad_norm": 1.3675256967544556, "learning_rate": 1.4864247006726839e-05, "loss": 0.492, "num_input_tokens_seen": 5836128, "step": 17330 }, { "epoch": 13.396445131375579, "grad_norm": 0.7475222945213318, "learning_rate": 1.4848837306240554e-05, "loss": 0.3219, "num_input_tokens_seen": 5837824, "step": 17335 }, { "epoch": 13.400309119010819, "grad_norm": 0.6741426587104797, "learning_rate": 1.4833432222563858e-05, "loss": 0.3914, "num_input_tokens_seen": 5839488, "step": 17340 }, { "epoch": 13.40417310664606, "grad_norm": 1.2184374332427979, "learning_rate": 1.4818031762703078e-05, "loss": 0.4291, "num_input_tokens_seen": 5840992, "step": 17345 }, { "epoch": 13.408037094281298, "grad_norm": 1.299586534500122, "learning_rate": 1.480263593366242e-05, "loss": 0.5117, "num_input_tokens_seen": 5842848, "step": 17350 }, { "epoch": 13.411901081916538, "grad_norm": 0.6774970293045044, "learning_rate": 1.4787244742444012e-05, "loss": 0.5343, "num_input_tokens_seen": 5844544, "step": 17355 }, { "epoch": 13.415765069551778, "grad_norm": 1.0468860864639282, "learning_rate": 1.4771858196047856e-05, "loss": 0.4017, "num_input_tokens_seen": 5845984, "step": 17360 }, { "epoch": 13.419629057187016, "grad_norm": 1.3412933349609375, "learning_rate": 1.475647630147185e-05, "loss": 0.4552, "num_input_tokens_seen": 5847680, "step": 17365 }, { "epoch": 13.423493044822257, "grad_norm": 0.8775840997695923, "learning_rate": 1.4741099065711761e-05, "loss": 0.4037, "num_input_tokens_seen": 5849312, "step": 17370 }, { "epoch": 13.427357032457497, "grad_norm": 0.8815622329711914, "learning_rate": 1.4725726495761267e-05, "loss": 0.6059, "num_input_tokens_seen": 5850944, "step": 17375 }, { "epoch": 13.431221020092735, "grad_norm": 0.8310419321060181, "learning_rate": 1.4710358598611886e-05, "loss": 0.3873, "num_input_tokens_seen": 5852736, "step": 17380 }, { "epoch": 13.435085007727976, "grad_norm": 0.8307220935821533, "learning_rate": 1.4694995381253035e-05, "loss": 0.375, "num_input_tokens_seen": 5854752, "step": 17385 }, { "epoch": 13.438948995363216, "grad_norm": 1.3741133213043213, "learning_rate": 1.4679636850672002e-05, "loss": 0.4892, "num_input_tokens_seen": 5856128, "step": 17390 }, { "epoch": 13.442812982998454, "grad_norm": 1.0832337141036987, "learning_rate": 1.466428301385393e-05, "loss": 0.3983, "num_input_tokens_seen": 5857792, "step": 17395 }, { "epoch": 13.446676970633694, "grad_norm": 1.02347731590271, "learning_rate": 1.4648933877781846e-05, "loss": 0.5371, "num_input_tokens_seen": 5859296, "step": 17400 }, { "epoch": 13.450540958268933, "grad_norm": 1.5480403900146484, "learning_rate": 1.463358944943663e-05, "loss": 0.4339, "num_input_tokens_seen": 5860896, "step": 17405 }, { "epoch": 13.454404945904173, "grad_norm": 1.1886005401611328, "learning_rate": 1.4618249735797005e-05, "loss": 0.632, "num_input_tokens_seen": 5862784, "step": 17410 }, { "epoch": 13.458268933539413, "grad_norm": 1.2978237867355347, "learning_rate": 1.460291474383958e-05, "loss": 0.4678, "num_input_tokens_seen": 5864704, "step": 17415 }, { "epoch": 13.462132921174652, "grad_norm": 1.0885722637176514, "learning_rate": 1.4587584480538796e-05, "loss": 0.5523, "num_input_tokens_seen": 5866432, "step": 17420 }, { "epoch": 13.465996908809892, "grad_norm": 0.8833515644073486, "learning_rate": 1.4572258952866945e-05, "loss": 0.4755, "num_input_tokens_seen": 5868160, "step": 17425 }, { "epoch": 13.469860896445132, "grad_norm": 0.9432836174964905, "learning_rate": 1.4556938167794166e-05, "loss": 0.4115, "num_input_tokens_seen": 5869824, "step": 17430 }, { "epoch": 13.47372488408037, "grad_norm": 0.8784289360046387, "learning_rate": 1.4541622132288445e-05, "loss": 0.3649, "num_input_tokens_seen": 5871328, "step": 17435 }, { "epoch": 13.47758887171561, "grad_norm": 1.3257759809494019, "learning_rate": 1.4526310853315625e-05, "loss": 0.4236, "num_input_tokens_seen": 5873088, "step": 17440 }, { "epoch": 13.481452859350851, "grad_norm": 0.6669439673423767, "learning_rate": 1.4511004337839352e-05, "loss": 0.3409, "num_input_tokens_seen": 5874560, "step": 17445 }, { "epoch": 13.48531684698609, "grad_norm": 0.9230324625968933, "learning_rate": 1.449570259282112e-05, "loss": 0.4795, "num_input_tokens_seen": 5876320, "step": 17450 }, { "epoch": 13.48918083462133, "grad_norm": 0.5733504295349121, "learning_rate": 1.4480405625220261e-05, "loss": 0.3797, "num_input_tokens_seen": 5878080, "step": 17455 }, { "epoch": 13.493044822256568, "grad_norm": 1.492041826248169, "learning_rate": 1.4465113441993918e-05, "loss": 0.3733, "num_input_tokens_seen": 5879744, "step": 17460 }, { "epoch": 13.496908809891808, "grad_norm": 0.8571057319641113, "learning_rate": 1.4449826050097065e-05, "loss": 0.4254, "num_input_tokens_seen": 5881344, "step": 17465 }, { "epoch": 13.500772797527048, "grad_norm": 1.5239111185073853, "learning_rate": 1.443454345648252e-05, "loss": 0.6399, "num_input_tokens_seen": 5882976, "step": 17470 }, { "epoch": 13.504636785162287, "grad_norm": 0.8819478750228882, "learning_rate": 1.4419265668100868e-05, "loss": 0.5822, "num_input_tokens_seen": 5884672, "step": 17475 }, { "epoch": 13.508500772797527, "grad_norm": 1.0827730894088745, "learning_rate": 1.440399269190057e-05, "loss": 0.4874, "num_input_tokens_seen": 5886304, "step": 17480 }, { "epoch": 13.512364760432767, "grad_norm": 0.6740099787712097, "learning_rate": 1.4388724534827852e-05, "loss": 0.3337, "num_input_tokens_seen": 5887904, "step": 17485 }, { "epoch": 13.516228748068006, "grad_norm": 1.1762630939483643, "learning_rate": 1.4373461203826767e-05, "loss": 0.7016, "num_input_tokens_seen": 5889504, "step": 17490 }, { "epoch": 13.520092735703246, "grad_norm": 1.3261339664459229, "learning_rate": 1.4358202705839163e-05, "loss": 0.6462, "num_input_tokens_seen": 5891104, "step": 17495 }, { "epoch": 13.523956723338486, "grad_norm": 1.3239490985870361, "learning_rate": 1.4342949047804688e-05, "loss": 0.4129, "num_input_tokens_seen": 5892896, "step": 17500 }, { "epoch": 13.527820710973725, "grad_norm": 0.9675479531288147, "learning_rate": 1.4327700236660824e-05, "loss": 0.388, "num_input_tokens_seen": 5894624, "step": 17505 }, { "epoch": 13.531684698608965, "grad_norm": 0.6466054320335388, "learning_rate": 1.4312456279342801e-05, "loss": 0.3555, "num_input_tokens_seen": 5896160, "step": 17510 }, { "epoch": 13.535548686244205, "grad_norm": 1.3418382406234741, "learning_rate": 1.4297217182783661e-05, "loss": 0.5277, "num_input_tokens_seen": 5897696, "step": 17515 }, { "epoch": 13.539412673879443, "grad_norm": 1.1404320001602173, "learning_rate": 1.4281982953914252e-05, "loss": 0.3543, "num_input_tokens_seen": 5899264, "step": 17520 }, { "epoch": 13.543276661514684, "grad_norm": 1.0111879110336304, "learning_rate": 1.4266753599663179e-05, "loss": 0.4115, "num_input_tokens_seen": 5900960, "step": 17525 }, { "epoch": 13.547140649149922, "grad_norm": 0.867690920829773, "learning_rate": 1.4251529126956852e-05, "loss": 0.4779, "num_input_tokens_seen": 5902592, "step": 17530 }, { "epoch": 13.551004636785162, "grad_norm": 0.762649655342102, "learning_rate": 1.423630954271944e-05, "loss": 0.3349, "num_input_tokens_seen": 5904096, "step": 17535 }, { "epoch": 13.554868624420402, "grad_norm": 0.7434679865837097, "learning_rate": 1.4221094853872915e-05, "loss": 0.4182, "num_input_tokens_seen": 5905696, "step": 17540 }, { "epoch": 13.55873261205564, "grad_norm": 0.8346335887908936, "learning_rate": 1.4205885067337005e-05, "loss": 0.6253, "num_input_tokens_seen": 5907328, "step": 17545 }, { "epoch": 13.562596599690881, "grad_norm": 1.1697518825531006, "learning_rate": 1.4190680190029209e-05, "loss": 0.5352, "num_input_tokens_seen": 5908896, "step": 17550 }, { "epoch": 13.566460587326121, "grad_norm": 0.8592986464500427, "learning_rate": 1.4175480228864788e-05, "loss": 0.3885, "num_input_tokens_seen": 5910752, "step": 17555 }, { "epoch": 13.57032457496136, "grad_norm": 1.1379214525222778, "learning_rate": 1.4160285190756795e-05, "loss": 0.5543, "num_input_tokens_seen": 5912320, "step": 17560 }, { "epoch": 13.5741885625966, "grad_norm": 1.1889435052871704, "learning_rate": 1.4145095082616012e-05, "loss": 0.3924, "num_input_tokens_seen": 5914048, "step": 17565 }, { "epoch": 13.578052550231838, "grad_norm": 0.5621602535247803, "learning_rate": 1.4129909911350981e-05, "loss": 0.4783, "num_input_tokens_seen": 5915552, "step": 17570 }, { "epoch": 13.581916537867079, "grad_norm": 0.8038898706436157, "learning_rate": 1.4114729683868033e-05, "loss": 0.4696, "num_input_tokens_seen": 5917312, "step": 17575 }, { "epoch": 13.585780525502319, "grad_norm": 1.0312398672103882, "learning_rate": 1.4099554407071214e-05, "loss": 0.4891, "num_input_tokens_seen": 5919136, "step": 17580 }, { "epoch": 13.589644513137557, "grad_norm": 0.8520827293395996, "learning_rate": 1.4084384087862331e-05, "loss": 0.5221, "num_input_tokens_seen": 5920736, "step": 17585 }, { "epoch": 13.593508500772797, "grad_norm": 1.1908409595489502, "learning_rate": 1.4069218733140938e-05, "loss": 0.3839, "num_input_tokens_seen": 5922336, "step": 17590 }, { "epoch": 13.597372488408038, "grad_norm": 1.2118302583694458, "learning_rate": 1.405405834980431e-05, "loss": 0.5544, "num_input_tokens_seen": 5924160, "step": 17595 }, { "epoch": 13.601236476043276, "grad_norm": 0.9132462739944458, "learning_rate": 1.4038902944747514e-05, "loss": 0.5251, "num_input_tokens_seen": 5925600, "step": 17600 }, { "epoch": 13.605100463678516, "grad_norm": 2.2415902614593506, "learning_rate": 1.4023752524863293e-05, "loss": 0.4501, "num_input_tokens_seen": 5927200, "step": 17605 }, { "epoch": 13.608964451313756, "grad_norm": 1.6445776224136353, "learning_rate": 1.4008607097042165e-05, "loss": 0.4369, "num_input_tokens_seen": 5928704, "step": 17610 }, { "epoch": 13.612828438948995, "grad_norm": 1.1516083478927612, "learning_rate": 1.3993466668172353e-05, "loss": 0.4737, "num_input_tokens_seen": 5930464, "step": 17615 }, { "epoch": 13.616692426584235, "grad_norm": 1.0226517915725708, "learning_rate": 1.3978331245139817e-05, "loss": 0.5362, "num_input_tokens_seen": 5932096, "step": 17620 }, { "epoch": 13.620556414219475, "grad_norm": 0.6792202591896057, "learning_rate": 1.3963200834828238e-05, "loss": 0.3571, "num_input_tokens_seen": 5933984, "step": 17625 }, { "epoch": 13.624420401854714, "grad_norm": 0.9299296140670776, "learning_rate": 1.3948075444119013e-05, "loss": 0.4151, "num_input_tokens_seen": 5935808, "step": 17630 }, { "epoch": 13.628284389489954, "grad_norm": 0.6823188662528992, "learning_rate": 1.3932955079891257e-05, "loss": 0.3718, "num_input_tokens_seen": 5937536, "step": 17635 }, { "epoch": 13.632148377125194, "grad_norm": 1.5718982219696045, "learning_rate": 1.3917839749021805e-05, "loss": 0.5433, "num_input_tokens_seen": 5939168, "step": 17640 }, { "epoch": 13.636012364760433, "grad_norm": 1.0991766452789307, "learning_rate": 1.3902729458385216e-05, "loss": 0.3722, "num_input_tokens_seen": 5940992, "step": 17645 }, { "epoch": 13.639876352395673, "grad_norm": 0.6821610331535339, "learning_rate": 1.3887624214853729e-05, "loss": 0.3917, "num_input_tokens_seen": 5942752, "step": 17650 }, { "epoch": 13.643740340030911, "grad_norm": 1.1295009851455688, "learning_rate": 1.3872524025297298e-05, "loss": 0.5211, "num_input_tokens_seen": 5944448, "step": 17655 }, { "epoch": 13.647604327666151, "grad_norm": 1.3797234296798706, "learning_rate": 1.3857428896583579e-05, "loss": 0.4829, "num_input_tokens_seen": 5946112, "step": 17660 }, { "epoch": 13.651468315301392, "grad_norm": 0.837177038192749, "learning_rate": 1.3842338835577928e-05, "loss": 0.4677, "num_input_tokens_seen": 5947968, "step": 17665 }, { "epoch": 13.65533230293663, "grad_norm": 1.6032133102416992, "learning_rate": 1.3827253849143388e-05, "loss": 0.6139, "num_input_tokens_seen": 5950048, "step": 17670 }, { "epoch": 13.65919629057187, "grad_norm": 1.0573242902755737, "learning_rate": 1.3812173944140705e-05, "loss": 0.5164, "num_input_tokens_seen": 5951744, "step": 17675 }, { "epoch": 13.66306027820711, "grad_norm": 1.1338249444961548, "learning_rate": 1.3797099127428325e-05, "loss": 0.3663, "num_input_tokens_seen": 5953344, "step": 17680 }, { "epoch": 13.666924265842349, "grad_norm": 1.027435302734375, "learning_rate": 1.3782029405862354e-05, "loss": 0.7837, "num_input_tokens_seen": 5955040, "step": 17685 }, { "epoch": 13.670788253477589, "grad_norm": 0.780169665813446, "learning_rate": 1.3766964786296587e-05, "loss": 0.4431, "num_input_tokens_seen": 5956864, "step": 17690 }, { "epoch": 13.674652241112828, "grad_norm": 1.0538049936294556, "learning_rate": 1.3751905275582513e-05, "loss": 0.3168, "num_input_tokens_seen": 5958624, "step": 17695 }, { "epoch": 13.678516228748068, "grad_norm": 0.8076536655426025, "learning_rate": 1.373685088056928e-05, "loss": 0.3471, "num_input_tokens_seen": 5960128, "step": 17700 }, { "epoch": 13.682380216383308, "grad_norm": 0.6605492234230042, "learning_rate": 1.3721801608103707e-05, "loss": 0.4429, "num_input_tokens_seen": 5961792, "step": 17705 }, { "epoch": 13.686244204018546, "grad_norm": 0.925927460193634, "learning_rate": 1.3706757465030317e-05, "loss": 0.504, "num_input_tokens_seen": 5963296, "step": 17710 }, { "epoch": 13.690108191653787, "grad_norm": 0.958798348903656, "learning_rate": 1.3691718458191255e-05, "loss": 0.526, "num_input_tokens_seen": 5965024, "step": 17715 }, { "epoch": 13.693972179289027, "grad_norm": 1.1849985122680664, "learning_rate": 1.3676684594426367e-05, "loss": 0.5262, "num_input_tokens_seen": 5966912, "step": 17720 }, { "epoch": 13.697836166924265, "grad_norm": 0.7317939400672913, "learning_rate": 1.366165588057314e-05, "loss": 0.4025, "num_input_tokens_seen": 5968352, "step": 17725 }, { "epoch": 13.701700154559505, "grad_norm": 1.3921395540237427, "learning_rate": 1.3646632323466724e-05, "loss": 0.5701, "num_input_tokens_seen": 5969888, "step": 17730 }, { "epoch": 13.705564142194746, "grad_norm": 1.0640056133270264, "learning_rate": 1.3631613929939918e-05, "loss": 0.4607, "num_input_tokens_seen": 5971456, "step": 17735 }, { "epoch": 13.709428129829984, "grad_norm": 1.3448582887649536, "learning_rate": 1.3616600706823168e-05, "loss": 0.4327, "num_input_tokens_seen": 5973184, "step": 17740 }, { "epoch": 13.713292117465224, "grad_norm": 0.7118769884109497, "learning_rate": 1.3601592660944601e-05, "loss": 0.3326, "num_input_tokens_seen": 5975008, "step": 17745 }, { "epoch": 13.717156105100464, "grad_norm": 0.8192259669303894, "learning_rate": 1.358658979912995e-05, "loss": 0.4456, "num_input_tokens_seen": 5976640, "step": 17750 }, { "epoch": 13.721020092735703, "grad_norm": 0.746871292591095, "learning_rate": 1.3571592128202606e-05, "loss": 0.5462, "num_input_tokens_seen": 5978240, "step": 17755 }, { "epoch": 13.724884080370943, "grad_norm": 1.1069122552871704, "learning_rate": 1.3556599654983613e-05, "loss": 0.4174, "num_input_tokens_seen": 5979616, "step": 17760 }, { "epoch": 13.728748068006183, "grad_norm": 1.7651668787002563, "learning_rate": 1.3541612386291627e-05, "loss": 0.5961, "num_input_tokens_seen": 5981280, "step": 17765 }, { "epoch": 13.732612055641422, "grad_norm": 1.578688621520996, "learning_rate": 1.3526630328942949e-05, "loss": 0.4627, "num_input_tokens_seen": 5983200, "step": 17770 }, { "epoch": 13.736476043276662, "grad_norm": 1.2938232421875, "learning_rate": 1.3511653489751502e-05, "loss": 0.5388, "num_input_tokens_seen": 5984992, "step": 17775 }, { "epoch": 13.7403400309119, "grad_norm": 1.0513287782669067, "learning_rate": 1.3496681875528855e-05, "loss": 0.4048, "num_input_tokens_seen": 5986592, "step": 17780 }, { "epoch": 13.74420401854714, "grad_norm": 1.1488393545150757, "learning_rate": 1.3481715493084185e-05, "loss": 0.3509, "num_input_tokens_seen": 5988128, "step": 17785 }, { "epoch": 13.74806800618238, "grad_norm": 1.0711207389831543, "learning_rate": 1.3466754349224287e-05, "loss": 0.8261, "num_input_tokens_seen": 5989728, "step": 17790 }, { "epoch": 13.75193199381762, "grad_norm": 0.9399608373641968, "learning_rate": 1.3451798450753569e-05, "loss": 0.4256, "num_input_tokens_seen": 5991424, "step": 17795 }, { "epoch": 13.75579598145286, "grad_norm": 0.8121631145477295, "learning_rate": 1.343684780447409e-05, "loss": 0.3487, "num_input_tokens_seen": 5993088, "step": 17800 }, { "epoch": 13.7596599690881, "grad_norm": 1.0210082530975342, "learning_rate": 1.3421902417185473e-05, "loss": 0.3798, "num_input_tokens_seen": 5994624, "step": 17805 }, { "epoch": 13.763523956723338, "grad_norm": 0.8984750509262085, "learning_rate": 1.3406962295684977e-05, "loss": 0.5294, "num_input_tokens_seen": 5996320, "step": 17810 }, { "epoch": 13.767387944358578, "grad_norm": 0.7295047640800476, "learning_rate": 1.3392027446767449e-05, "loss": 0.3785, "num_input_tokens_seen": 5997792, "step": 17815 }, { "epoch": 13.771251931993817, "grad_norm": 1.3573405742645264, "learning_rate": 1.3377097877225363e-05, "loss": 0.4954, "num_input_tokens_seen": 5999168, "step": 17820 }, { "epoch": 13.775115919629057, "grad_norm": 1.8452329635620117, "learning_rate": 1.3362173593848774e-05, "loss": 0.5271, "num_input_tokens_seen": 6001152, "step": 17825 }, { "epoch": 13.778979907264297, "grad_norm": 0.9009010791778564, "learning_rate": 1.3347254603425327e-05, "loss": 0.3527, "num_input_tokens_seen": 6002816, "step": 17830 }, { "epoch": 13.782843894899536, "grad_norm": 0.8482018113136292, "learning_rate": 1.3332340912740263e-05, "loss": 0.3894, "num_input_tokens_seen": 6004288, "step": 17835 }, { "epoch": 13.786707882534776, "grad_norm": 0.8684111833572388, "learning_rate": 1.3317432528576435e-05, "loss": 0.3558, "num_input_tokens_seen": 6005696, "step": 17840 }, { "epoch": 13.790571870170016, "grad_norm": 0.8285952806472778, "learning_rate": 1.3302529457714258e-05, "loss": 0.4215, "num_input_tokens_seen": 6007584, "step": 17845 }, { "epoch": 13.794435857805254, "grad_norm": 0.8430747985839844, "learning_rate": 1.3287631706931727e-05, "loss": 0.4051, "num_input_tokens_seen": 6008992, "step": 17850 }, { "epoch": 13.798299845440495, "grad_norm": 0.6468982100486755, "learning_rate": 1.3272739283004449e-05, "loss": 0.3529, "num_input_tokens_seen": 6010432, "step": 17855 }, { "epoch": 13.802163833075735, "grad_norm": 1.6936023235321045, "learning_rate": 1.3257852192705578e-05, "loss": 0.5343, "num_input_tokens_seen": 6012288, "step": 17860 }, { "epoch": 13.806027820710973, "grad_norm": 0.7257324457168579, "learning_rate": 1.3242970442805846e-05, "loss": 0.3075, "num_input_tokens_seen": 6014272, "step": 17865 }, { "epoch": 13.809891808346213, "grad_norm": 1.4906940460205078, "learning_rate": 1.3228094040073574e-05, "loss": 0.4854, "num_input_tokens_seen": 6015904, "step": 17870 }, { "epoch": 13.813755795981454, "grad_norm": 0.7643553018569946, "learning_rate": 1.3213222991274616e-05, "loss": 0.3776, "num_input_tokens_seen": 6017728, "step": 17875 }, { "epoch": 13.817619783616692, "grad_norm": 0.6574145555496216, "learning_rate": 1.3198357303172443e-05, "loss": 0.6807, "num_input_tokens_seen": 6019168, "step": 17880 }, { "epoch": 13.821483771251932, "grad_norm": 0.8369853496551514, "learning_rate": 1.3183496982528031e-05, "loss": 0.5536, "num_input_tokens_seen": 6020960, "step": 17885 }, { "epoch": 13.825347758887172, "grad_norm": 1.79603910446167, "learning_rate": 1.3168642036099973e-05, "loss": 0.6015, "num_input_tokens_seen": 6022592, "step": 17890 }, { "epoch": 13.829211746522411, "grad_norm": 0.9928605556488037, "learning_rate": 1.3153792470644371e-05, "loss": 0.3938, "num_input_tokens_seen": 6024192, "step": 17895 }, { "epoch": 13.833075734157651, "grad_norm": 1.4848525524139404, "learning_rate": 1.3138948292914896e-05, "loss": 0.4576, "num_input_tokens_seen": 6026048, "step": 17900 }, { "epoch": 13.83693972179289, "grad_norm": 0.8158094882965088, "learning_rate": 1.3124109509662772e-05, "loss": 0.34, "num_input_tokens_seen": 6027712, "step": 17905 }, { "epoch": 13.84080370942813, "grad_norm": 1.0486464500427246, "learning_rate": 1.3109276127636763e-05, "loss": 0.5602, "num_input_tokens_seen": 6029376, "step": 17910 }, { "epoch": 13.84466769706337, "grad_norm": 0.7981244921684265, "learning_rate": 1.3094448153583171e-05, "loss": 0.3841, "num_input_tokens_seen": 6031136, "step": 17915 }, { "epoch": 13.848531684698608, "grad_norm": 1.7117120027542114, "learning_rate": 1.3079625594245865e-05, "loss": 0.4455, "num_input_tokens_seen": 6032768, "step": 17920 }, { "epoch": 13.852395672333849, "grad_norm": 1.067049503326416, "learning_rate": 1.3064808456366228e-05, "loss": 0.5305, "num_input_tokens_seen": 6034720, "step": 17925 }, { "epoch": 13.856259659969089, "grad_norm": 1.3764705657958984, "learning_rate": 1.3049996746683188e-05, "loss": 0.4539, "num_input_tokens_seen": 6036320, "step": 17930 }, { "epoch": 13.860123647604327, "grad_norm": 1.421942114830017, "learning_rate": 1.3035190471933193e-05, "loss": 0.3943, "num_input_tokens_seen": 6037952, "step": 17935 }, { "epoch": 13.863987635239567, "grad_norm": 0.8411855697631836, "learning_rate": 1.3020389638850223e-05, "loss": 0.4462, "num_input_tokens_seen": 6039968, "step": 17940 }, { "epoch": 13.867851622874806, "grad_norm": 1.1592323780059814, "learning_rate": 1.300559425416579e-05, "loss": 0.4788, "num_input_tokens_seen": 6041760, "step": 17945 }, { "epoch": 13.871715610510046, "grad_norm": 0.9052354693412781, "learning_rate": 1.2990804324608913e-05, "loss": 0.3635, "num_input_tokens_seen": 6043296, "step": 17950 }, { "epoch": 13.875579598145286, "grad_norm": 0.7216789722442627, "learning_rate": 1.2976019856906146e-05, "loss": 0.4022, "num_input_tokens_seen": 6045216, "step": 17955 }, { "epoch": 13.879443585780525, "grad_norm": 1.6459025144577026, "learning_rate": 1.296124085778157e-05, "loss": 0.4077, "num_input_tokens_seen": 6046720, "step": 17960 }, { "epoch": 13.883307573415765, "grad_norm": 1.7749695777893066, "learning_rate": 1.294646733395675e-05, "loss": 0.5957, "num_input_tokens_seen": 6048416, "step": 17965 }, { "epoch": 13.887171561051005, "grad_norm": 1.3303629159927368, "learning_rate": 1.2931699292150767e-05, "loss": 0.54, "num_input_tokens_seen": 6050016, "step": 17970 }, { "epoch": 13.891035548686244, "grad_norm": 3.0991787910461426, "learning_rate": 1.2916936739080226e-05, "loss": 0.6008, "num_input_tokens_seen": 6051936, "step": 17975 }, { "epoch": 13.894899536321484, "grad_norm": 0.8185438513755798, "learning_rate": 1.2902179681459215e-05, "loss": 0.5259, "num_input_tokens_seen": 6054048, "step": 17980 }, { "epoch": 13.898763523956724, "grad_norm": 1.06870698928833, "learning_rate": 1.2887428125999329e-05, "loss": 0.4416, "num_input_tokens_seen": 6055616, "step": 17985 }, { "epoch": 13.902627511591962, "grad_norm": 0.898955225944519, "learning_rate": 1.2872682079409678e-05, "loss": 0.4299, "num_input_tokens_seen": 6057376, "step": 17990 }, { "epoch": 13.906491499227203, "grad_norm": 0.8640966415405273, "learning_rate": 1.2857941548396846e-05, "loss": 0.4712, "num_input_tokens_seen": 6059168, "step": 17995 }, { "epoch": 13.910355486862443, "grad_norm": 0.7713926434516907, "learning_rate": 1.2843206539664903e-05, "loss": 0.4251, "num_input_tokens_seen": 6060960, "step": 18000 }, { "epoch": 13.914219474497681, "grad_norm": 1.2674098014831543, "learning_rate": 1.2828477059915443e-05, "loss": 0.381, "num_input_tokens_seen": 6062400, "step": 18005 }, { "epoch": 13.918083462132921, "grad_norm": 0.9750695824623108, "learning_rate": 1.2813753115847504e-05, "loss": 0.4404, "num_input_tokens_seen": 6064128, "step": 18010 }, { "epoch": 13.921947449768162, "grad_norm": 1.0242923498153687, "learning_rate": 1.2799034714157632e-05, "loss": 0.429, "num_input_tokens_seen": 6065696, "step": 18015 }, { "epoch": 13.9258114374034, "grad_norm": 1.059396743774414, "learning_rate": 1.2784321861539828e-05, "loss": 0.3605, "num_input_tokens_seen": 6067392, "step": 18020 }, { "epoch": 13.92967542503864, "grad_norm": 0.859088659286499, "learning_rate": 1.2769614564685611e-05, "loss": 0.7174, "num_input_tokens_seen": 6069120, "step": 18025 }, { "epoch": 13.933539412673879, "grad_norm": 0.9776920080184937, "learning_rate": 1.2754912830283933e-05, "loss": 0.4517, "num_input_tokens_seen": 6070784, "step": 18030 }, { "epoch": 13.937403400309119, "grad_norm": 0.8003031611442566, "learning_rate": 1.2740216665021231e-05, "loss": 0.4904, "num_input_tokens_seen": 6072736, "step": 18035 }, { "epoch": 13.94126738794436, "grad_norm": 1.214372992515564, "learning_rate": 1.2725526075581404e-05, "loss": 0.4722, "num_input_tokens_seen": 6074368, "step": 18040 }, { "epoch": 13.945131375579598, "grad_norm": 1.0009530782699585, "learning_rate": 1.2710841068645834e-05, "loss": 0.3629, "num_input_tokens_seen": 6076032, "step": 18045 }, { "epoch": 13.948995363214838, "grad_norm": 1.2682515382766724, "learning_rate": 1.2696161650893346e-05, "loss": 0.435, "num_input_tokens_seen": 6077536, "step": 18050 }, { "epoch": 13.952859350850078, "grad_norm": 1.3694822788238525, "learning_rate": 1.2681487829000214e-05, "loss": 0.5359, "num_input_tokens_seen": 6079104, "step": 18055 }, { "epoch": 13.956723338485316, "grad_norm": 0.9138496518135071, "learning_rate": 1.2666819609640196e-05, "loss": 0.4062, "num_input_tokens_seen": 6080832, "step": 18060 }, { "epoch": 13.960587326120557, "grad_norm": 1.0146868228912354, "learning_rate": 1.2652156999484482e-05, "loss": 0.5014, "num_input_tokens_seen": 6082176, "step": 18065 }, { "epoch": 13.964451313755795, "grad_norm": 1.5167945623397827, "learning_rate": 1.2637500005201713e-05, "loss": 0.6942, "num_input_tokens_seen": 6084000, "step": 18070 }, { "epoch": 13.968315301391035, "grad_norm": 0.8254223465919495, "learning_rate": 1.2622848633457979e-05, "loss": 0.4165, "num_input_tokens_seen": 6085600, "step": 18075 }, { "epoch": 13.972179289026275, "grad_norm": 0.8676234483718872, "learning_rate": 1.26082028909168e-05, "loss": 0.3883, "num_input_tokens_seen": 6087520, "step": 18080 }, { "epoch": 13.976043276661514, "grad_norm": 0.5973824262619019, "learning_rate": 1.2593562784239166e-05, "loss": 0.4494, "num_input_tokens_seen": 6089088, "step": 18085 }, { "epoch": 13.979907264296754, "grad_norm": 1.0986071825027466, "learning_rate": 1.2578928320083472e-05, "loss": 0.4158, "num_input_tokens_seen": 6090720, "step": 18090 }, { "epoch": 13.983771251931994, "grad_norm": 0.9684088826179504, "learning_rate": 1.256429950510557e-05, "loss": 0.3538, "num_input_tokens_seen": 6092128, "step": 18095 }, { "epoch": 13.987635239567233, "grad_norm": 1.0249459743499756, "learning_rate": 1.2549676345958727e-05, "loss": 0.4504, "num_input_tokens_seen": 6093984, "step": 18100 }, { "epoch": 13.991499227202473, "grad_norm": 0.8593318462371826, "learning_rate": 1.2535058849293646e-05, "loss": 0.3199, "num_input_tokens_seen": 6095840, "step": 18105 }, { "epoch": 13.995363214837713, "grad_norm": 1.183746576309204, "learning_rate": 1.252044702175845e-05, "loss": 0.3513, "num_input_tokens_seen": 6097568, "step": 18110 }, { "epoch": 13.999227202472952, "grad_norm": 0.915687084197998, "learning_rate": 1.2505840869998686e-05, "loss": 0.4771, "num_input_tokens_seen": 6099424, "step": 18115 }, { "epoch": 14.0, "eval_loss": 0.4655553102493286, "eval_runtime": 6.3679, "eval_samples_per_second": 90.296, "eval_steps_per_second": 22.613, "num_input_tokens_seen": 6099600, "step": 18116 }, { "epoch": 14.003091190108192, "grad_norm": 0.8590710163116455, "learning_rate": 1.2491240400657309e-05, "loss": 0.5069, "num_input_tokens_seen": 6100976, "step": 18120 }, { "epoch": 14.006955177743432, "grad_norm": 1.268460750579834, "learning_rate": 1.2476645620374708e-05, "loss": 0.395, "num_input_tokens_seen": 6102416, "step": 18125 }, { "epoch": 14.01081916537867, "grad_norm": 0.6208818554878235, "learning_rate": 1.2462056535788689e-05, "loss": 0.4105, "num_input_tokens_seen": 6103888, "step": 18130 }, { "epoch": 14.01468315301391, "grad_norm": 0.8854100108146667, "learning_rate": 1.2447473153534444e-05, "loss": 0.5716, "num_input_tokens_seen": 6105584, "step": 18135 }, { "epoch": 14.018547140649149, "grad_norm": 0.6985594630241394, "learning_rate": 1.2432895480244583e-05, "loss": 0.3779, "num_input_tokens_seen": 6107664, "step": 18140 }, { "epoch": 14.02241112828439, "grad_norm": 0.6761245131492615, "learning_rate": 1.2418323522549122e-05, "loss": 0.4388, "num_input_tokens_seen": 6109488, "step": 18145 }, { "epoch": 14.02627511591963, "grad_norm": 1.2983567714691162, "learning_rate": 1.2403757287075468e-05, "loss": 0.3765, "num_input_tokens_seen": 6111184, "step": 18150 }, { "epoch": 14.030139103554868, "grad_norm": 0.6661332249641418, "learning_rate": 1.2389196780448425e-05, "loss": 0.4412, "num_input_tokens_seen": 6112880, "step": 18155 }, { "epoch": 14.034003091190108, "grad_norm": 0.9465147852897644, "learning_rate": 1.2374642009290213e-05, "loss": 0.4972, "num_input_tokens_seen": 6114416, "step": 18160 }, { "epoch": 14.037867078825348, "grad_norm": 1.084678292274475, "learning_rate": 1.2360092980220434e-05, "loss": 0.3627, "num_input_tokens_seen": 6116144, "step": 18165 }, { "epoch": 14.041731066460587, "grad_norm": 1.1848642826080322, "learning_rate": 1.2345549699856065e-05, "loss": 0.3676, "num_input_tokens_seen": 6117552, "step": 18170 }, { "epoch": 14.045595054095827, "grad_norm": 0.986496090888977, "learning_rate": 1.2331012174811477e-05, "loss": 0.3943, "num_input_tokens_seen": 6119312, "step": 18175 }, { "epoch": 14.049459041731067, "grad_norm": 1.676842212677002, "learning_rate": 1.2316480411698423e-05, "loss": 0.3909, "num_input_tokens_seen": 6120880, "step": 18180 }, { "epoch": 14.053323029366306, "grad_norm": 1.6902800798416138, "learning_rate": 1.2301954417126035e-05, "loss": 0.4455, "num_input_tokens_seen": 6122576, "step": 18185 }, { "epoch": 14.057187017001546, "grad_norm": 1.1927114725112915, "learning_rate": 1.2287434197700817e-05, "loss": 0.4894, "num_input_tokens_seen": 6124304, "step": 18190 }, { "epoch": 14.061051004636786, "grad_norm": 0.9883570671081543, "learning_rate": 1.2272919760026666e-05, "loss": 0.4293, "num_input_tokens_seen": 6126000, "step": 18195 }, { "epoch": 14.064914992272024, "grad_norm": 0.9842354655265808, "learning_rate": 1.225841111070482e-05, "loss": 0.3823, "num_input_tokens_seen": 6127408, "step": 18200 }, { "epoch": 14.068778979907265, "grad_norm": 0.5687853097915649, "learning_rate": 1.2243908256333917e-05, "loss": 0.3739, "num_input_tokens_seen": 6129168, "step": 18205 }, { "epoch": 14.072642967542503, "grad_norm": 1.2609879970550537, "learning_rate": 1.222941120350993e-05, "loss": 0.381, "num_input_tokens_seen": 6130864, "step": 18210 }, { "epoch": 14.076506955177743, "grad_norm": 1.2527830600738525, "learning_rate": 1.2214919958826206e-05, "loss": 0.3887, "num_input_tokens_seen": 6132720, "step": 18215 }, { "epoch": 14.080370942812984, "grad_norm": 1.0685683488845825, "learning_rate": 1.2200434528873456e-05, "loss": 0.3778, "num_input_tokens_seen": 6134352, "step": 18220 }, { "epoch": 14.084234930448222, "grad_norm": 0.872532069683075, "learning_rate": 1.2185954920239725e-05, "loss": 0.4569, "num_input_tokens_seen": 6136304, "step": 18225 }, { "epoch": 14.088098918083462, "grad_norm": 1.1020475625991821, "learning_rate": 1.2171481139510446e-05, "loss": 0.5579, "num_input_tokens_seen": 6138064, "step": 18230 }, { "epoch": 14.091962905718702, "grad_norm": 0.8840386867523193, "learning_rate": 1.2157013193268371e-05, "loss": 0.402, "num_input_tokens_seen": 6139728, "step": 18235 }, { "epoch": 14.09582689335394, "grad_norm": 1.2332487106323242, "learning_rate": 1.2142551088093599e-05, "loss": 0.4418, "num_input_tokens_seen": 6141552, "step": 18240 }, { "epoch": 14.099690880989181, "grad_norm": 1.0207489728927612, "learning_rate": 1.2128094830563605e-05, "loss": 0.4227, "num_input_tokens_seen": 6142928, "step": 18245 }, { "epoch": 14.103554868624421, "grad_norm": 1.0717719793319702, "learning_rate": 1.2113644427253165e-05, "loss": 0.3601, "num_input_tokens_seen": 6144464, "step": 18250 }, { "epoch": 14.10741885625966, "grad_norm": 0.9920199513435364, "learning_rate": 1.2099199884734416e-05, "loss": 0.365, "num_input_tokens_seen": 6146128, "step": 18255 }, { "epoch": 14.1112828438949, "grad_norm": 1.4034570455551147, "learning_rate": 1.2084761209576808e-05, "loss": 0.448, "num_input_tokens_seen": 6147920, "step": 18260 }, { "epoch": 14.115146831530138, "grad_norm": 1.4670897722244263, "learning_rate": 1.2070328408347159e-05, "loss": 0.4141, "num_input_tokens_seen": 6149616, "step": 18265 }, { "epoch": 14.119010819165378, "grad_norm": 1.4253268241882324, "learning_rate": 1.205590148760958e-05, "loss": 0.3336, "num_input_tokens_seen": 6151440, "step": 18270 }, { "epoch": 14.122874806800619, "grad_norm": 1.1600091457366943, "learning_rate": 1.2041480453925527e-05, "loss": 0.3226, "num_input_tokens_seen": 6153200, "step": 18275 }, { "epoch": 14.126738794435857, "grad_norm": 0.9176198244094849, "learning_rate": 1.2027065313853759e-05, "loss": 0.3552, "num_input_tokens_seen": 6154960, "step": 18280 }, { "epoch": 14.130602782071097, "grad_norm": 1.2029935121536255, "learning_rate": 1.2012656073950385e-05, "loss": 0.5913, "num_input_tokens_seen": 6156656, "step": 18285 }, { "epoch": 14.134466769706338, "grad_norm": 0.8582462072372437, "learning_rate": 1.1998252740768809e-05, "loss": 0.3525, "num_input_tokens_seen": 6158032, "step": 18290 }, { "epoch": 14.138330757341576, "grad_norm": 1.2329877614974976, "learning_rate": 1.198385532085974e-05, "loss": 0.4695, "num_input_tokens_seen": 6159792, "step": 18295 }, { "epoch": 14.142194744976816, "grad_norm": 0.9026252031326294, "learning_rate": 1.1969463820771231e-05, "loss": 0.3674, "num_input_tokens_seen": 6161328, "step": 18300 }, { "epoch": 14.146058732612056, "grad_norm": 0.9813036322593689, "learning_rate": 1.1955078247048614e-05, "loss": 0.3863, "num_input_tokens_seen": 6162896, "step": 18305 }, { "epoch": 14.149922720247295, "grad_norm": 2.2990882396698, "learning_rate": 1.1940698606234535e-05, "loss": 0.5371, "num_input_tokens_seen": 6164816, "step": 18310 }, { "epoch": 14.153786707882535, "grad_norm": 0.49347543716430664, "learning_rate": 1.1926324904868938e-05, "loss": 0.448, "num_input_tokens_seen": 6166448, "step": 18315 }, { "epoch": 14.157650695517773, "grad_norm": 0.5959368348121643, "learning_rate": 1.1911957149489058e-05, "loss": 0.479, "num_input_tokens_seen": 6167888, "step": 18320 }, { "epoch": 14.161514683153014, "grad_norm": 1.314574122428894, "learning_rate": 1.1897595346629459e-05, "loss": 0.483, "num_input_tokens_seen": 6169456, "step": 18325 }, { "epoch": 14.165378670788254, "grad_norm": 0.6838774085044861, "learning_rate": 1.1883239502821954e-05, "loss": 0.577, "num_input_tokens_seen": 6170896, "step": 18330 }, { "epoch": 14.169242658423492, "grad_norm": 0.7408624887466431, "learning_rate": 1.1868889624595686e-05, "loss": 0.4028, "num_input_tokens_seen": 6172624, "step": 18335 }, { "epoch": 14.173106646058732, "grad_norm": 1.057802438735962, "learning_rate": 1.1854545718477054e-05, "loss": 0.4626, "num_input_tokens_seen": 6174256, "step": 18340 }, { "epoch": 14.176970633693973, "grad_norm": 0.7474061846733093, "learning_rate": 1.1840207790989754e-05, "loss": 0.5558, "num_input_tokens_seen": 6176144, "step": 18345 }, { "epoch": 14.180834621329211, "grad_norm": 0.9518608450889587, "learning_rate": 1.1825875848654766e-05, "loss": 0.4108, "num_input_tokens_seen": 6177616, "step": 18350 }, { "epoch": 14.184698608964451, "grad_norm": 1.076192021369934, "learning_rate": 1.1811549897990335e-05, "loss": 0.4768, "num_input_tokens_seen": 6179248, "step": 18355 }, { "epoch": 14.188562596599692, "grad_norm": 1.6120450496673584, "learning_rate": 1.1797229945511983e-05, "loss": 0.36, "num_input_tokens_seen": 6181072, "step": 18360 }, { "epoch": 14.19242658423493, "grad_norm": 0.9554226398468018, "learning_rate": 1.1782915997732522e-05, "loss": 0.413, "num_input_tokens_seen": 6182896, "step": 18365 }, { "epoch": 14.19629057187017, "grad_norm": 1.1067838668823242, "learning_rate": 1.1768608061162028e-05, "loss": 0.514, "num_input_tokens_seen": 6184496, "step": 18370 }, { "epoch": 14.20015455950541, "grad_norm": 0.7863054275512695, "learning_rate": 1.1754306142307827e-05, "loss": 0.4537, "num_input_tokens_seen": 6186256, "step": 18375 }, { "epoch": 14.204018547140649, "grad_norm": 1.514715313911438, "learning_rate": 1.1740010247674518e-05, "loss": 0.4463, "num_input_tokens_seen": 6187792, "step": 18380 }, { "epoch": 14.207882534775889, "grad_norm": 0.8931658267974854, "learning_rate": 1.172572038376396e-05, "loss": 0.4108, "num_input_tokens_seen": 6189552, "step": 18385 }, { "epoch": 14.211746522411127, "grad_norm": 1.360833764076233, "learning_rate": 1.1711436557075268e-05, "loss": 0.6161, "num_input_tokens_seen": 6191120, "step": 18390 }, { "epoch": 14.215610510046368, "grad_norm": 1.2456501722335815, "learning_rate": 1.1697158774104802e-05, "loss": 0.3576, "num_input_tokens_seen": 6192848, "step": 18395 }, { "epoch": 14.219474497681608, "grad_norm": 0.972895085811615, "learning_rate": 1.1682887041346195e-05, "loss": 0.3873, "num_input_tokens_seen": 6194544, "step": 18400 }, { "epoch": 14.223338485316846, "grad_norm": 1.016701340675354, "learning_rate": 1.1668621365290322e-05, "loss": 0.4448, "num_input_tokens_seen": 6196336, "step": 18405 }, { "epoch": 14.227202472952087, "grad_norm": 1.1002992391586304, "learning_rate": 1.165436175242529e-05, "loss": 0.5982, "num_input_tokens_seen": 6198064, "step": 18410 }, { "epoch": 14.231066460587327, "grad_norm": 1.3610814809799194, "learning_rate": 1.1640108209236458e-05, "loss": 0.3903, "num_input_tokens_seen": 6199856, "step": 18415 }, { "epoch": 14.234930448222565, "grad_norm": 0.8846145868301392, "learning_rate": 1.162586074220642e-05, "loss": 0.4058, "num_input_tokens_seen": 6201456, "step": 18420 }, { "epoch": 14.238794435857805, "grad_norm": 0.9274861812591553, "learning_rate": 1.1611619357815012e-05, "loss": 0.4821, "num_input_tokens_seen": 6203152, "step": 18425 }, { "epoch": 14.242658423493046, "grad_norm": 0.645031750202179, "learning_rate": 1.1597384062539293e-05, "loss": 0.4033, "num_input_tokens_seen": 6204592, "step": 18430 }, { "epoch": 14.246522411128284, "grad_norm": 0.8245256543159485, "learning_rate": 1.1583154862853573e-05, "loss": 0.3207, "num_input_tokens_seen": 6206352, "step": 18435 }, { "epoch": 14.250386398763524, "grad_norm": 1.779756784439087, "learning_rate": 1.1568931765229365e-05, "loss": 0.3918, "num_input_tokens_seen": 6207888, "step": 18440 }, { "epoch": 14.254250386398763, "grad_norm": 1.0156524181365967, "learning_rate": 1.1554714776135437e-05, "loss": 0.6117, "num_input_tokens_seen": 6209712, "step": 18445 }, { "epoch": 14.258114374034003, "grad_norm": 1.2906899452209473, "learning_rate": 1.1540503902037744e-05, "loss": 0.5345, "num_input_tokens_seen": 6211696, "step": 18450 }, { "epoch": 14.261978361669243, "grad_norm": 1.2080520391464233, "learning_rate": 1.1526299149399486e-05, "loss": 0.3722, "num_input_tokens_seen": 6213360, "step": 18455 }, { "epoch": 14.265842349304481, "grad_norm": 1.4612834453582764, "learning_rate": 1.1512100524681064e-05, "loss": 0.4788, "num_input_tokens_seen": 6214928, "step": 18460 }, { "epoch": 14.269706336939722, "grad_norm": 0.6210913062095642, "learning_rate": 1.149790803434009e-05, "loss": 0.5615, "num_input_tokens_seen": 6216560, "step": 18465 }, { "epoch": 14.273570324574962, "grad_norm": 0.614508867263794, "learning_rate": 1.1483721684831414e-05, "loss": 0.6245, "num_input_tokens_seen": 6218160, "step": 18470 }, { "epoch": 14.2774343122102, "grad_norm": 1.140392541885376, "learning_rate": 1.146954148260706e-05, "loss": 0.4326, "num_input_tokens_seen": 6220048, "step": 18475 }, { "epoch": 14.28129829984544, "grad_norm": 0.8836193084716797, "learning_rate": 1.145536743411626e-05, "loss": 0.4253, "num_input_tokens_seen": 6221840, "step": 18480 }, { "epoch": 14.28516228748068, "grad_norm": 0.8829614520072937, "learning_rate": 1.1441199545805479e-05, "loss": 0.3246, "num_input_tokens_seen": 6223504, "step": 18485 }, { "epoch": 14.28902627511592, "grad_norm": 0.8625841736793518, "learning_rate": 1.1427037824118342e-05, "loss": 0.5144, "num_input_tokens_seen": 6225104, "step": 18490 }, { "epoch": 14.29289026275116, "grad_norm": 1.7729800939559937, "learning_rate": 1.141288227549569e-05, "loss": 0.6, "num_input_tokens_seen": 6226864, "step": 18495 }, { "epoch": 14.2967542503864, "grad_norm": 0.9871971011161804, "learning_rate": 1.139873290637554e-05, "loss": 0.4237, "num_input_tokens_seen": 6228784, "step": 18500 }, { "epoch": 14.300618238021638, "grad_norm": 0.7276538014411926, "learning_rate": 1.1384589723193126e-05, "loss": 0.3722, "num_input_tokens_seen": 6230192, "step": 18505 }, { "epoch": 14.304482225656878, "grad_norm": 0.8242553472518921, "learning_rate": 1.1370452732380845e-05, "loss": 0.3775, "num_input_tokens_seen": 6231696, "step": 18510 }, { "epoch": 14.308346213292117, "grad_norm": 0.8484584093093872, "learning_rate": 1.135632194036829e-05, "loss": 0.3887, "num_input_tokens_seen": 6233296, "step": 18515 }, { "epoch": 14.312210200927357, "grad_norm": 1.4542189836502075, "learning_rate": 1.1342197353582213e-05, "loss": 0.6157, "num_input_tokens_seen": 6234992, "step": 18520 }, { "epoch": 14.316074188562597, "grad_norm": 0.832855224609375, "learning_rate": 1.1328078978446583e-05, "loss": 0.4078, "num_input_tokens_seen": 6236720, "step": 18525 }, { "epoch": 14.319938176197835, "grad_norm": 1.4645891189575195, "learning_rate": 1.131396682138251e-05, "loss": 0.444, "num_input_tokens_seen": 6238480, "step": 18530 }, { "epoch": 14.323802163833076, "grad_norm": 0.7124123573303223, "learning_rate": 1.129986088880829e-05, "loss": 0.344, "num_input_tokens_seen": 6240176, "step": 18535 }, { "epoch": 14.327666151468316, "grad_norm": 0.7510783672332764, "learning_rate": 1.1285761187139373e-05, "loss": 0.3121, "num_input_tokens_seen": 6241776, "step": 18540 }, { "epoch": 14.331530139103554, "grad_norm": 1.1151450872421265, "learning_rate": 1.1271667722788412e-05, "loss": 0.4005, "num_input_tokens_seen": 6243504, "step": 18545 }, { "epoch": 14.335394126738795, "grad_norm": 1.006447434425354, "learning_rate": 1.1257580502165186e-05, "loss": 0.398, "num_input_tokens_seen": 6245200, "step": 18550 }, { "epoch": 14.339258114374035, "grad_norm": 0.7866183519363403, "learning_rate": 1.1243499531676646e-05, "loss": 0.5007, "num_input_tokens_seen": 6246864, "step": 18555 }, { "epoch": 14.343122102009273, "grad_norm": 1.4932289123535156, "learning_rate": 1.1229424817726897e-05, "loss": 0.6309, "num_input_tokens_seen": 6248624, "step": 18560 }, { "epoch": 14.346986089644513, "grad_norm": 1.0597878694534302, "learning_rate": 1.1215356366717216e-05, "loss": 0.4133, "num_input_tokens_seen": 6250384, "step": 18565 }, { "epoch": 14.350850077279752, "grad_norm": 1.4848861694335938, "learning_rate": 1.1201294185046015e-05, "loss": 0.4732, "num_input_tokens_seen": 6252112, "step": 18570 }, { "epoch": 14.354714064914992, "grad_norm": 0.9380308389663696, "learning_rate": 1.1187238279108844e-05, "loss": 0.5968, "num_input_tokens_seen": 6253904, "step": 18575 }, { "epoch": 14.358578052550232, "grad_norm": 0.685771644115448, "learning_rate": 1.1173188655298436e-05, "loss": 0.4802, "num_input_tokens_seen": 6255536, "step": 18580 }, { "epoch": 14.36244204018547, "grad_norm": 0.7987534403800964, "learning_rate": 1.1159145320004632e-05, "loss": 0.3591, "num_input_tokens_seen": 6257040, "step": 18585 }, { "epoch": 14.36630602782071, "grad_norm": 1.4685001373291016, "learning_rate": 1.1145108279614427e-05, "loss": 0.4329, "num_input_tokens_seen": 6258640, "step": 18590 }, { "epoch": 14.370170015455951, "grad_norm": 1.1765421628952026, "learning_rate": 1.1131077540511952e-05, "loss": 0.3478, "num_input_tokens_seen": 6260368, "step": 18595 }, { "epoch": 14.37403400309119, "grad_norm": 1.0775871276855469, "learning_rate": 1.1117053109078457e-05, "loss": 0.4163, "num_input_tokens_seen": 6262096, "step": 18600 }, { "epoch": 14.37789799072643, "grad_norm": 0.908610999584198, "learning_rate": 1.110303499169236e-05, "loss": 0.4648, "num_input_tokens_seen": 6263696, "step": 18605 }, { "epoch": 14.38176197836167, "grad_norm": 0.8481155037879944, "learning_rate": 1.1089023194729164e-05, "loss": 0.4341, "num_input_tokens_seen": 6265360, "step": 18610 }, { "epoch": 14.385625965996908, "grad_norm": 1.1368142366409302, "learning_rate": 1.107501772456154e-05, "loss": 0.3678, "num_input_tokens_seen": 6267024, "step": 18615 }, { "epoch": 14.389489953632149, "grad_norm": 1.1232986450195312, "learning_rate": 1.106101858755925e-05, "loss": 0.4462, "num_input_tokens_seen": 6268528, "step": 18620 }, { "epoch": 14.393353941267389, "grad_norm": 0.6911967396736145, "learning_rate": 1.104702579008918e-05, "loss": 0.4196, "num_input_tokens_seen": 6270128, "step": 18625 }, { "epoch": 14.397217928902627, "grad_norm": 1.0740904808044434, "learning_rate": 1.1033039338515341e-05, "loss": 0.5015, "num_input_tokens_seen": 6271824, "step": 18630 }, { "epoch": 14.401081916537867, "grad_norm": 1.3923513889312744, "learning_rate": 1.1019059239198859e-05, "loss": 0.4348, "num_input_tokens_seen": 6273392, "step": 18635 }, { "epoch": 14.404945904173106, "grad_norm": 2.587063789367676, "learning_rate": 1.1005085498497952e-05, "loss": 0.5096, "num_input_tokens_seen": 6275120, "step": 18640 }, { "epoch": 14.408809891808346, "grad_norm": 0.7501828670501709, "learning_rate": 1.0991118122767974e-05, "loss": 0.3694, "num_input_tokens_seen": 6276656, "step": 18645 }, { "epoch": 14.412673879443586, "grad_norm": 0.8717324137687683, "learning_rate": 1.0977157118361378e-05, "loss": 0.4598, "num_input_tokens_seen": 6278192, "step": 18650 }, { "epoch": 14.416537867078825, "grad_norm": 1.1660058498382568, "learning_rate": 1.0963202491627703e-05, "loss": 0.5308, "num_input_tokens_seen": 6279728, "step": 18655 }, { "epoch": 14.420401854714065, "grad_norm": 1.2900481224060059, "learning_rate": 1.09492542489136e-05, "loss": 0.7079, "num_input_tokens_seen": 6281328, "step": 18660 }, { "epoch": 14.424265842349305, "grad_norm": 1.3408464193344116, "learning_rate": 1.093531239656281e-05, "loss": 0.4242, "num_input_tokens_seen": 6282800, "step": 18665 }, { "epoch": 14.428129829984544, "grad_norm": 0.8793601989746094, "learning_rate": 1.0921376940916173e-05, "loss": 0.4552, "num_input_tokens_seen": 6284336, "step": 18670 }, { "epoch": 14.431993817619784, "grad_norm": 0.5560921430587769, "learning_rate": 1.0907447888311606e-05, "loss": 0.6299, "num_input_tokens_seen": 6286384, "step": 18675 }, { "epoch": 14.435857805255024, "grad_norm": 0.91298508644104, "learning_rate": 1.0893525245084138e-05, "loss": 0.3654, "num_input_tokens_seen": 6288112, "step": 18680 }, { "epoch": 14.439721792890262, "grad_norm": 0.9709028601646423, "learning_rate": 1.0879609017565879e-05, "loss": 0.3967, "num_input_tokens_seen": 6289680, "step": 18685 }, { "epoch": 14.443585780525503, "grad_norm": 1.001818299293518, "learning_rate": 1.0865699212086e-05, "loss": 0.4611, "num_input_tokens_seen": 6291280, "step": 18690 }, { "epoch": 14.447449768160741, "grad_norm": 1.0484566688537598, "learning_rate": 1.0851795834970767e-05, "loss": 0.3826, "num_input_tokens_seen": 6292720, "step": 18695 }, { "epoch": 14.451313755795981, "grad_norm": 0.9572664499282837, "learning_rate": 1.0837898892543522e-05, "loss": 0.412, "num_input_tokens_seen": 6294352, "step": 18700 }, { "epoch": 14.455177743431221, "grad_norm": 1.172400951385498, "learning_rate": 1.0824008391124669e-05, "loss": 0.3804, "num_input_tokens_seen": 6296112, "step": 18705 }, { "epoch": 14.45904173106646, "grad_norm": 0.7056426405906677, "learning_rate": 1.0810124337031691e-05, "loss": 0.507, "num_input_tokens_seen": 6297744, "step": 18710 }, { "epoch": 14.4629057187017, "grad_norm": 1.4796640872955322, "learning_rate": 1.0796246736579152e-05, "loss": 0.5681, "num_input_tokens_seen": 6299504, "step": 18715 }, { "epoch": 14.46676970633694, "grad_norm": 0.7462121248245239, "learning_rate": 1.078237559607865e-05, "loss": 0.4466, "num_input_tokens_seen": 6301264, "step": 18720 }, { "epoch": 14.470633693972179, "grad_norm": 1.2491655349731445, "learning_rate": 1.0768510921838885e-05, "loss": 0.411, "num_input_tokens_seen": 6302832, "step": 18725 }, { "epoch": 14.474497681607419, "grad_norm": 0.8002195358276367, "learning_rate": 1.0754652720165578e-05, "loss": 0.3469, "num_input_tokens_seen": 6304496, "step": 18730 }, { "epoch": 14.478361669242659, "grad_norm": 0.908942699432373, "learning_rate": 1.0740800997361528e-05, "loss": 0.472, "num_input_tokens_seen": 6306224, "step": 18735 }, { "epoch": 14.482225656877898, "grad_norm": 1.2195185422897339, "learning_rate": 1.0726955759726579e-05, "loss": 0.6985, "num_input_tokens_seen": 6307760, "step": 18740 }, { "epoch": 14.486089644513138, "grad_norm": 0.9356109499931335, "learning_rate": 1.0713117013557618e-05, "loss": 0.6856, "num_input_tokens_seen": 6309584, "step": 18745 }, { "epoch": 14.489953632148378, "grad_norm": 0.8817458748817444, "learning_rate": 1.0699284765148613e-05, "loss": 0.5867, "num_input_tokens_seen": 6311344, "step": 18750 }, { "epoch": 14.493817619783616, "grad_norm": 0.8594443798065186, "learning_rate": 1.0685459020790536e-05, "loss": 0.4127, "num_input_tokens_seen": 6312784, "step": 18755 }, { "epoch": 14.497681607418857, "grad_norm": 1.3442174196243286, "learning_rate": 1.0671639786771415e-05, "loss": 0.8979, "num_input_tokens_seen": 6314384, "step": 18760 }, { "epoch": 14.501545595054095, "grad_norm": 0.8897141218185425, "learning_rate": 1.0657827069376339e-05, "loss": 0.3761, "num_input_tokens_seen": 6315984, "step": 18765 }, { "epoch": 14.505409582689335, "grad_norm": 1.28997802734375, "learning_rate": 1.0644020874887404e-05, "loss": 0.448, "num_input_tokens_seen": 6317744, "step": 18770 }, { "epoch": 14.509273570324575, "grad_norm": 0.6488756537437439, "learning_rate": 1.0630221209583747e-05, "loss": 0.5104, "num_input_tokens_seen": 6319344, "step": 18775 }, { "epoch": 14.513137557959814, "grad_norm": 0.908607006072998, "learning_rate": 1.0616428079741534e-05, "loss": 0.3621, "num_input_tokens_seen": 6320656, "step": 18780 }, { "epoch": 14.517001545595054, "grad_norm": 0.9635596871376038, "learning_rate": 1.0602641491633977e-05, "loss": 0.3984, "num_input_tokens_seen": 6322128, "step": 18785 }, { "epoch": 14.520865533230294, "grad_norm": 0.9188789129257202, "learning_rate": 1.0588861451531293e-05, "loss": 0.3512, "num_input_tokens_seen": 6323888, "step": 18790 }, { "epoch": 14.524729520865533, "grad_norm": 0.6005940437316895, "learning_rate": 1.0575087965700728e-05, "loss": 0.4184, "num_input_tokens_seen": 6325584, "step": 18795 }, { "epoch": 14.528593508500773, "grad_norm": 0.6759510040283203, "learning_rate": 1.0561321040406532e-05, "loss": 0.3654, "num_input_tokens_seen": 6327216, "step": 18800 }, { "epoch": 14.532457496136013, "grad_norm": 1.1764416694641113, "learning_rate": 1.0547560681910008e-05, "loss": 0.3934, "num_input_tokens_seen": 6328752, "step": 18805 }, { "epoch": 14.536321483771252, "grad_norm": 0.8750681281089783, "learning_rate": 1.0533806896469436e-05, "loss": 0.4639, "num_input_tokens_seen": 6330480, "step": 18810 }, { "epoch": 14.540185471406492, "grad_norm": 0.9267116189002991, "learning_rate": 1.0520059690340115e-05, "loss": 0.4325, "num_input_tokens_seen": 6331984, "step": 18815 }, { "epoch": 14.54404945904173, "grad_norm": 1.425931453704834, "learning_rate": 1.050631906977437e-05, "loss": 0.5991, "num_input_tokens_seen": 6333744, "step": 18820 }, { "epoch": 14.54791344667697, "grad_norm": 1.1606501340866089, "learning_rate": 1.0492585041021513e-05, "loss": 0.4488, "num_input_tokens_seen": 6335600, "step": 18825 }, { "epoch": 14.55177743431221, "grad_norm": 1.3904842138290405, "learning_rate": 1.047885761032786e-05, "loss": 0.4753, "num_input_tokens_seen": 6337328, "step": 18830 }, { "epoch": 14.555641421947449, "grad_norm": 0.9359961152076721, "learning_rate": 1.0465136783936732e-05, "loss": 0.4325, "num_input_tokens_seen": 6338960, "step": 18835 }, { "epoch": 14.55950540958269, "grad_norm": 0.939130425453186, "learning_rate": 1.045142256808843e-05, "loss": 0.4052, "num_input_tokens_seen": 6340624, "step": 18840 }, { "epoch": 14.56336939721793, "grad_norm": 1.1880050897598267, "learning_rate": 1.043771496902028e-05, "loss": 0.4095, "num_input_tokens_seen": 6342416, "step": 18845 }, { "epoch": 14.567233384853168, "grad_norm": 0.6576241850852966, "learning_rate": 1.0424013992966564e-05, "loss": 0.3976, "num_input_tokens_seen": 6344144, "step": 18850 }, { "epoch": 14.571097372488408, "grad_norm": 1.1490033864974976, "learning_rate": 1.0410319646158587e-05, "loss": 0.381, "num_input_tokens_seen": 6345776, "step": 18855 }, { "epoch": 14.574961360123648, "grad_norm": 0.6539040207862854, "learning_rate": 1.0396631934824605e-05, "loss": 0.3589, "num_input_tokens_seen": 6347312, "step": 18860 }, { "epoch": 14.578825347758887, "grad_norm": 0.7782238721847534, "learning_rate": 1.0382950865189878e-05, "loss": 0.4797, "num_input_tokens_seen": 6349040, "step": 18865 }, { "epoch": 14.582689335394127, "grad_norm": 0.7582292556762695, "learning_rate": 1.0369276443476636e-05, "loss": 0.3499, "num_input_tokens_seen": 6350512, "step": 18870 }, { "epoch": 14.586553323029367, "grad_norm": 1.513394832611084, "learning_rate": 1.0355608675904086e-05, "loss": 0.6294, "num_input_tokens_seen": 6352400, "step": 18875 }, { "epoch": 14.590417310664606, "grad_norm": 0.8654966354370117, "learning_rate": 1.0341947568688404e-05, "loss": 0.7695, "num_input_tokens_seen": 6354096, "step": 18880 }, { "epoch": 14.594281298299846, "grad_norm": 1.8811745643615723, "learning_rate": 1.0328293128042752e-05, "loss": 0.3994, "num_input_tokens_seen": 6355984, "step": 18885 }, { "epoch": 14.598145285935084, "grad_norm": 0.8837943077087402, "learning_rate": 1.0314645360177258e-05, "loss": 0.525, "num_input_tokens_seen": 6357840, "step": 18890 }, { "epoch": 14.602009273570324, "grad_norm": 1.0783360004425049, "learning_rate": 1.0301004271299003e-05, "loss": 0.38, "num_input_tokens_seen": 6359504, "step": 18895 }, { "epoch": 14.605873261205565, "grad_norm": 0.9702631831169128, "learning_rate": 1.0287369867612032e-05, "loss": 0.4797, "num_input_tokens_seen": 6361104, "step": 18900 }, { "epoch": 14.609737248840803, "grad_norm": 0.7540988326072693, "learning_rate": 1.027374215531736e-05, "loss": 0.3612, "num_input_tokens_seen": 6362736, "step": 18905 }, { "epoch": 14.613601236476043, "grad_norm": 1.103702187538147, "learning_rate": 1.0260121140612944e-05, "loss": 0.3836, "num_input_tokens_seen": 6364368, "step": 18910 }, { "epoch": 14.617465224111283, "grad_norm": 0.7727727890014648, "learning_rate": 1.0246506829693697e-05, "loss": 0.3918, "num_input_tokens_seen": 6365840, "step": 18915 }, { "epoch": 14.621329211746522, "grad_norm": 1.0491223335266113, "learning_rate": 1.0232899228751502e-05, "loss": 0.3905, "num_input_tokens_seen": 6367248, "step": 18920 }, { "epoch": 14.625193199381762, "grad_norm": 0.6052944660186768, "learning_rate": 1.021929834397518e-05, "loss": 0.55, "num_input_tokens_seen": 6368912, "step": 18925 }, { "epoch": 14.629057187017002, "grad_norm": 2.8066494464874268, "learning_rate": 1.0205704181550493e-05, "loss": 0.5629, "num_input_tokens_seen": 6370512, "step": 18930 }, { "epoch": 14.63292117465224, "grad_norm": 0.6652064919471741, "learning_rate": 1.0192116747660144e-05, "loss": 0.3256, "num_input_tokens_seen": 6372240, "step": 18935 }, { "epoch": 14.636785162287481, "grad_norm": 0.8747981786727905, "learning_rate": 1.0178536048483777e-05, "loss": 0.4958, "num_input_tokens_seen": 6373616, "step": 18940 }, { "epoch": 14.64064914992272, "grad_norm": 0.7111654281616211, "learning_rate": 1.0164962090197977e-05, "loss": 0.3698, "num_input_tokens_seen": 6375280, "step": 18945 }, { "epoch": 14.64451313755796, "grad_norm": 0.9682006239891052, "learning_rate": 1.0151394878976256e-05, "loss": 0.4515, "num_input_tokens_seen": 6376976, "step": 18950 }, { "epoch": 14.6483771251932, "grad_norm": 0.9953284859657288, "learning_rate": 1.0137834420989076e-05, "loss": 0.4687, "num_input_tokens_seen": 6378512, "step": 18955 }, { "epoch": 14.652241112828438, "grad_norm": 0.8019205927848816, "learning_rate": 1.0124280722403807e-05, "loss": 0.4019, "num_input_tokens_seen": 6380528, "step": 18960 }, { "epoch": 14.656105100463678, "grad_norm": 0.8470495343208313, "learning_rate": 1.0110733789384744e-05, "loss": 0.4455, "num_input_tokens_seen": 6381936, "step": 18965 }, { "epoch": 14.659969088098919, "grad_norm": 0.7733544707298279, "learning_rate": 1.009719362809313e-05, "loss": 0.3582, "num_input_tokens_seen": 6383344, "step": 18970 }, { "epoch": 14.663833075734157, "grad_norm": 0.9151595830917358, "learning_rate": 1.0083660244687104e-05, "loss": 0.5061, "num_input_tokens_seen": 6385040, "step": 18975 }, { "epoch": 14.667697063369397, "grad_norm": 1.3398231267929077, "learning_rate": 1.0070133645321728e-05, "loss": 0.5156, "num_input_tokens_seen": 6387056, "step": 18980 }, { "epoch": 14.671561051004637, "grad_norm": 1.3708268404006958, "learning_rate": 1.0056613836148976e-05, "loss": 0.3873, "num_input_tokens_seen": 6388688, "step": 18985 }, { "epoch": 14.675425038639876, "grad_norm": 0.7332267761230469, "learning_rate": 1.004310082331775e-05, "loss": 0.3987, "num_input_tokens_seen": 6390224, "step": 18990 }, { "epoch": 14.679289026275116, "grad_norm": 1.7410409450531006, "learning_rate": 1.0029594612973842e-05, "loss": 0.587, "num_input_tokens_seen": 6392112, "step": 18995 }, { "epoch": 14.683153013910356, "grad_norm": 0.8356276154518127, "learning_rate": 1.001609521125996e-05, "loss": 0.4782, "num_input_tokens_seen": 6393616, "step": 19000 }, { "epoch": 14.687017001545595, "grad_norm": 1.0835028886795044, "learning_rate": 1.0002602624315702e-05, "loss": 0.4338, "num_input_tokens_seen": 6395408, "step": 19005 }, { "epoch": 14.690880989180835, "grad_norm": 0.9087915420532227, "learning_rate": 9.989116858277595e-06, "loss": 0.4039, "num_input_tokens_seen": 6397232, "step": 19010 }, { "epoch": 14.694744976816073, "grad_norm": 0.9062017202377319, "learning_rate": 9.975637919279038e-06, "loss": 0.426, "num_input_tokens_seen": 6399056, "step": 19015 }, { "epoch": 14.698608964451314, "grad_norm": 0.5760612487792969, "learning_rate": 9.962165813450322e-06, "loss": 0.3474, "num_input_tokens_seen": 6400560, "step": 19020 }, { "epoch": 14.702472952086554, "grad_norm": 0.764103889465332, "learning_rate": 9.948700546918663e-06, "loss": 0.3074, "num_input_tokens_seen": 6402480, "step": 19025 }, { "epoch": 14.706336939721792, "grad_norm": 0.810163140296936, "learning_rate": 9.935242125808134e-06, "loss": 0.5898, "num_input_tokens_seen": 6404144, "step": 19030 }, { "epoch": 14.710200927357032, "grad_norm": 0.9111660718917847, "learning_rate": 9.921790556239704e-06, "loss": 0.5891, "num_input_tokens_seen": 6406096, "step": 19035 }, { "epoch": 14.714064914992273, "grad_norm": 1.3015002012252808, "learning_rate": 9.90834584433123e-06, "loss": 0.4097, "num_input_tokens_seen": 6407696, "step": 19040 }, { "epoch": 14.717928902627511, "grad_norm": 0.8618611097335815, "learning_rate": 9.894907996197436e-06, "loss": 0.3728, "num_input_tokens_seen": 6409264, "step": 19045 }, { "epoch": 14.721792890262751, "grad_norm": 1.306954026222229, "learning_rate": 9.881477017949959e-06, "loss": 0.5835, "num_input_tokens_seen": 6410800, "step": 19050 }, { "epoch": 14.725656877897991, "grad_norm": 1.101812720298767, "learning_rate": 9.868052915697263e-06, "loss": 0.478, "num_input_tokens_seen": 6412656, "step": 19055 }, { "epoch": 14.72952086553323, "grad_norm": 1.3665223121643066, "learning_rate": 9.854635695544731e-06, "loss": 0.5667, "num_input_tokens_seen": 6414288, "step": 19060 }, { "epoch": 14.73338485316847, "grad_norm": 1.0398021936416626, "learning_rate": 9.84122536359459e-06, "loss": 0.5747, "num_input_tokens_seen": 6416208, "step": 19065 }, { "epoch": 14.737248840803709, "grad_norm": 0.8978924751281738, "learning_rate": 9.827821925945932e-06, "loss": 0.4742, "num_input_tokens_seen": 6417968, "step": 19070 }, { "epoch": 14.741112828438949, "grad_norm": 1.101004958152771, "learning_rate": 9.814425388694728e-06, "loss": 0.3549, "num_input_tokens_seen": 6419696, "step": 19075 }, { "epoch": 14.744976816074189, "grad_norm": 0.9836042523384094, "learning_rate": 9.8010357579338e-06, "loss": 0.3903, "num_input_tokens_seen": 6421616, "step": 19080 }, { "epoch": 14.748840803709427, "grad_norm": 0.9948567748069763, "learning_rate": 9.787653039752819e-06, "loss": 0.4938, "num_input_tokens_seen": 6423344, "step": 19085 }, { "epoch": 14.752704791344668, "grad_norm": 1.785406470298767, "learning_rate": 9.774277240238343e-06, "loss": 0.6118, "num_input_tokens_seen": 6425168, "step": 19090 }, { "epoch": 14.756568778979908, "grad_norm": 1.8193081617355347, "learning_rate": 9.76090836547377e-06, "loss": 0.4788, "num_input_tokens_seen": 6426736, "step": 19095 }, { "epoch": 14.760432766615146, "grad_norm": 1.9054628610610962, "learning_rate": 9.747546421539333e-06, "loss": 0.5737, "num_input_tokens_seen": 6428304, "step": 19100 }, { "epoch": 14.764296754250386, "grad_norm": 1.178361415863037, "learning_rate": 9.734191414512132e-06, "loss": 0.6568, "num_input_tokens_seen": 6430032, "step": 19105 }, { "epoch": 14.768160741885627, "grad_norm": 0.7228152751922607, "learning_rate": 9.720843350466094e-06, "loss": 0.3249, "num_input_tokens_seen": 6431696, "step": 19110 }, { "epoch": 14.772024729520865, "grad_norm": 1.2140834331512451, "learning_rate": 9.707502235472005e-06, "loss": 0.4986, "num_input_tokens_seen": 6433392, "step": 19115 }, { "epoch": 14.775888717156105, "grad_norm": 0.7155811786651611, "learning_rate": 9.694168075597474e-06, "loss": 0.3195, "num_input_tokens_seen": 6434896, "step": 19120 }, { "epoch": 14.779752704791346, "grad_norm": 1.0478417873382568, "learning_rate": 9.680840876906974e-06, "loss": 0.6151, "num_input_tokens_seen": 6436816, "step": 19125 }, { "epoch": 14.783616692426584, "grad_norm": 1.2471566200256348, "learning_rate": 9.667520645461777e-06, "loss": 0.4493, "num_input_tokens_seen": 6438640, "step": 19130 }, { "epoch": 14.787480680061824, "grad_norm": 0.7187371253967285, "learning_rate": 9.654207387320022e-06, "loss": 0.4361, "num_input_tokens_seen": 6440464, "step": 19135 }, { "epoch": 14.791344667697063, "grad_norm": 1.1835806369781494, "learning_rate": 9.64090110853665e-06, "loss": 0.5124, "num_input_tokens_seen": 6442384, "step": 19140 }, { "epoch": 14.795208655332303, "grad_norm": 0.8137050271034241, "learning_rate": 9.627601815163436e-06, "loss": 0.5396, "num_input_tokens_seen": 6444144, "step": 19145 }, { "epoch": 14.799072642967543, "grad_norm": 0.9890490174293518, "learning_rate": 9.614309513248976e-06, "loss": 0.4503, "num_input_tokens_seen": 6446064, "step": 19150 }, { "epoch": 14.802936630602781, "grad_norm": 0.8909057974815369, "learning_rate": 9.601024208838686e-06, "loss": 0.4663, "num_input_tokens_seen": 6447728, "step": 19155 }, { "epoch": 14.806800618238022, "grad_norm": 1.9463821649551392, "learning_rate": 9.587745907974812e-06, "loss": 0.3751, "num_input_tokens_seen": 6449264, "step": 19160 }, { "epoch": 14.810664605873262, "grad_norm": 0.5888672471046448, "learning_rate": 9.574474616696391e-06, "loss": 0.4295, "num_input_tokens_seen": 6450992, "step": 19165 }, { "epoch": 14.8145285935085, "grad_norm": 0.9003983736038208, "learning_rate": 9.561210341039303e-06, "loss": 0.3161, "num_input_tokens_seen": 6452976, "step": 19170 }, { "epoch": 14.81839258114374, "grad_norm": 0.9588744044303894, "learning_rate": 9.547953087036212e-06, "loss": 0.5596, "num_input_tokens_seen": 6454480, "step": 19175 }, { "epoch": 14.82225656877898, "grad_norm": 1.0402594804763794, "learning_rate": 9.534702860716596e-06, "loss": 0.3711, "num_input_tokens_seen": 6456368, "step": 19180 }, { "epoch": 14.826120556414219, "grad_norm": 1.075931429862976, "learning_rate": 9.521459668106736e-06, "loss": 0.4562, "num_input_tokens_seen": 6457872, "step": 19185 }, { "epoch": 14.82998454404946, "grad_norm": 0.5968374609947205, "learning_rate": 9.508223515229709e-06, "loss": 0.3604, "num_input_tokens_seen": 6459344, "step": 19190 }, { "epoch": 14.833848531684698, "grad_norm": 0.8713786005973816, "learning_rate": 9.494994408105412e-06, "loss": 0.4165, "num_input_tokens_seen": 6460944, "step": 19195 }, { "epoch": 14.837712519319938, "grad_norm": 1.3027198314666748, "learning_rate": 9.481772352750513e-06, "loss": 0.56, "num_input_tokens_seen": 6462480, "step": 19200 }, { "epoch": 14.841576506955178, "grad_norm": 0.8037568926811218, "learning_rate": 9.468557355178476e-06, "loss": 0.3616, "num_input_tokens_seen": 6464080, "step": 19205 }, { "epoch": 14.845440494590417, "grad_norm": 0.7815109491348267, "learning_rate": 9.455349421399575e-06, "loss": 0.5139, "num_input_tokens_seen": 6465712, "step": 19210 }, { "epoch": 14.849304482225657, "grad_norm": 1.0081709623336792, "learning_rate": 9.442148557420851e-06, "loss": 0.4099, "num_input_tokens_seen": 6467248, "step": 19215 }, { "epoch": 14.853168469860897, "grad_norm": 0.7670748829841614, "learning_rate": 9.428954769246134e-06, "loss": 0.3603, "num_input_tokens_seen": 6468976, "step": 19220 }, { "epoch": 14.857032457496135, "grad_norm": 0.8778604865074158, "learning_rate": 9.415768062876043e-06, "loss": 0.3857, "num_input_tokens_seen": 6470544, "step": 19225 }, { "epoch": 14.860896445131376, "grad_norm": 0.7216187119483948, "learning_rate": 9.402588444307955e-06, "loss": 0.3796, "num_input_tokens_seen": 6472272, "step": 19230 }, { "epoch": 14.864760432766616, "grad_norm": 1.6328791379928589, "learning_rate": 9.389415919536062e-06, "loss": 0.4592, "num_input_tokens_seen": 6473872, "step": 19235 }, { "epoch": 14.868624420401854, "grad_norm": 0.8699063658714294, "learning_rate": 9.376250494551298e-06, "loss": 0.3424, "num_input_tokens_seen": 6475504, "step": 19240 }, { "epoch": 14.872488408037094, "grad_norm": 0.7680568695068359, "learning_rate": 9.363092175341365e-06, "loss": 0.3782, "num_input_tokens_seen": 6477200, "step": 19245 }, { "epoch": 14.876352395672335, "grad_norm": 1.0937080383300781, "learning_rate": 9.349940967890767e-06, "loss": 0.4324, "num_input_tokens_seen": 6478704, "step": 19250 }, { "epoch": 14.880216383307573, "grad_norm": 0.7665273547172546, "learning_rate": 9.33679687818074e-06, "loss": 0.5674, "num_input_tokens_seen": 6480496, "step": 19255 }, { "epoch": 14.884080370942813, "grad_norm": 1.635043740272522, "learning_rate": 9.323659912189295e-06, "loss": 0.4358, "num_input_tokens_seen": 6482288, "step": 19260 }, { "epoch": 14.887944358578052, "grad_norm": 1.8955389261245728, "learning_rate": 9.310530075891196e-06, "loss": 0.6547, "num_input_tokens_seen": 6484016, "step": 19265 }, { "epoch": 14.891808346213292, "grad_norm": 0.8962358236312866, "learning_rate": 9.29740737525799e-06, "loss": 0.4407, "num_input_tokens_seen": 6486000, "step": 19270 }, { "epoch": 14.895672333848532, "grad_norm": 1.267247200012207, "learning_rate": 9.284291816257947e-06, "loss": 0.6545, "num_input_tokens_seen": 6487600, "step": 19275 }, { "epoch": 14.89953632148377, "grad_norm": 0.7244680523872375, "learning_rate": 9.271183404856104e-06, "loss": 0.4306, "num_input_tokens_seen": 6489456, "step": 19280 }, { "epoch": 14.90340030911901, "grad_norm": 0.9147462844848633, "learning_rate": 9.258082147014236e-06, "loss": 0.6256, "num_input_tokens_seen": 6491248, "step": 19285 }, { "epoch": 14.907264296754251, "grad_norm": 1.104341983795166, "learning_rate": 9.244988048690892e-06, "loss": 0.6087, "num_input_tokens_seen": 6493232, "step": 19290 }, { "epoch": 14.91112828438949, "grad_norm": 0.5912265181541443, "learning_rate": 9.231901115841335e-06, "loss": 0.399, "num_input_tokens_seen": 6495056, "step": 19295 }, { "epoch": 14.91499227202473, "grad_norm": 0.7966699004173279, "learning_rate": 9.218821354417574e-06, "loss": 0.4119, "num_input_tokens_seen": 6496976, "step": 19300 }, { "epoch": 14.91885625965997, "grad_norm": 1.0859403610229492, "learning_rate": 9.205748770368378e-06, "loss": 0.6271, "num_input_tokens_seen": 6498512, "step": 19305 }, { "epoch": 14.922720247295208, "grad_norm": 2.4156906604766846, "learning_rate": 9.19268336963923e-06, "loss": 0.4132, "num_input_tokens_seen": 6500272, "step": 19310 }, { "epoch": 14.926584234930449, "grad_norm": 1.220388650894165, "learning_rate": 9.179625158172354e-06, "loss": 0.5407, "num_input_tokens_seen": 6502160, "step": 19315 }, { "epoch": 14.930448222565687, "grad_norm": 1.482774019241333, "learning_rate": 9.166574141906698e-06, "loss": 0.5094, "num_input_tokens_seen": 6503760, "step": 19320 }, { "epoch": 14.934312210200927, "grad_norm": 0.7747642397880554, "learning_rate": 9.153530326777937e-06, "loss": 0.3893, "num_input_tokens_seen": 6505392, "step": 19325 }, { "epoch": 14.938176197836167, "grad_norm": 0.9906272888183594, "learning_rate": 9.140493718718493e-06, "loss": 0.4181, "num_input_tokens_seen": 6507088, "step": 19330 }, { "epoch": 14.942040185471406, "grad_norm": 0.7637263536453247, "learning_rate": 9.127464323657476e-06, "loss": 0.4062, "num_input_tokens_seen": 6508624, "step": 19335 }, { "epoch": 14.945904173106646, "grad_norm": 0.715268075466156, "learning_rate": 9.114442147520749e-06, "loss": 0.3648, "num_input_tokens_seen": 6510512, "step": 19340 }, { "epoch": 14.949768160741886, "grad_norm": 1.3389148712158203, "learning_rate": 9.101427196230869e-06, "loss": 0.3878, "num_input_tokens_seen": 6512304, "step": 19345 }, { "epoch": 14.953632148377125, "grad_norm": 1.3113404512405396, "learning_rate": 9.088419475707113e-06, "loss": 0.4913, "num_input_tokens_seen": 6514000, "step": 19350 }, { "epoch": 14.957496136012365, "grad_norm": 1.0388420820236206, "learning_rate": 9.07541899186547e-06, "loss": 0.4319, "num_input_tokens_seen": 6515664, "step": 19355 }, { "epoch": 14.961360123647605, "grad_norm": 1.038145661354065, "learning_rate": 9.06242575061864e-06, "loss": 0.6141, "num_input_tokens_seen": 6517584, "step": 19360 }, { "epoch": 14.965224111282843, "grad_norm": 0.7149609923362732, "learning_rate": 9.049439757876013e-06, "loss": 0.4496, "num_input_tokens_seen": 6519280, "step": 19365 }, { "epoch": 14.969088098918084, "grad_norm": 0.8918706178665161, "learning_rate": 9.03646101954371e-06, "loss": 0.376, "num_input_tokens_seen": 6520976, "step": 19370 }, { "epoch": 14.972952086553324, "grad_norm": 0.6685710549354553, "learning_rate": 9.023489541524546e-06, "loss": 0.421, "num_input_tokens_seen": 6522480, "step": 19375 }, { "epoch": 14.976816074188562, "grad_norm": 1.5582813024520874, "learning_rate": 9.010525329718017e-06, "loss": 0.5181, "num_input_tokens_seen": 6524176, "step": 19380 }, { "epoch": 14.980680061823803, "grad_norm": 1.2322003841400146, "learning_rate": 8.997568390020328e-06, "loss": 0.3787, "num_input_tokens_seen": 6525840, "step": 19385 }, { "epoch": 14.984544049459041, "grad_norm": 0.7311603426933289, "learning_rate": 8.984618728324368e-06, "loss": 0.4501, "num_input_tokens_seen": 6527600, "step": 19390 }, { "epoch": 14.988408037094281, "grad_norm": 1.120324730873108, "learning_rate": 8.971676350519723e-06, "loss": 0.597, "num_input_tokens_seen": 6529424, "step": 19395 }, { "epoch": 14.992272024729521, "grad_norm": 1.1426798105239868, "learning_rate": 8.958741262492654e-06, "loss": 0.3852, "num_input_tokens_seen": 6531056, "step": 19400 }, { "epoch": 14.99613601236476, "grad_norm": 0.8049088716506958, "learning_rate": 8.945813470126127e-06, "loss": 0.4754, "num_input_tokens_seen": 6532880, "step": 19405 }, { "epoch": 15.0, "grad_norm": 1.1541895866394043, "learning_rate": 8.932892979299787e-06, "loss": 0.5392, "num_input_tokens_seen": 6534256, "step": 19410 }, { "epoch": 15.0, "eval_loss": 0.4644283652305603, "eval_runtime": 6.371, "eval_samples_per_second": 90.253, "eval_steps_per_second": 22.602, "num_input_tokens_seen": 6534256, "step": 19410 }, { "epoch": 15.00386398763524, "grad_norm": 0.8585046529769897, "learning_rate": 8.919979795889943e-06, "loss": 0.4221, "num_input_tokens_seen": 6535696, "step": 19415 }, { "epoch": 15.007727975270479, "grad_norm": 0.5428534746170044, "learning_rate": 8.907073925769585e-06, "loss": 0.4413, "num_input_tokens_seen": 6537296, "step": 19420 }, { "epoch": 15.011591962905719, "grad_norm": 0.8198378682136536, "learning_rate": 8.894175374808386e-06, "loss": 0.4065, "num_input_tokens_seen": 6538928, "step": 19425 }, { "epoch": 15.015455950540959, "grad_norm": 0.7297564744949341, "learning_rate": 8.881284148872678e-06, "loss": 0.3229, "num_input_tokens_seen": 6540496, "step": 19430 }, { "epoch": 15.019319938176197, "grad_norm": 0.8334338665008545, "learning_rate": 8.868400253825462e-06, "loss": 0.3365, "num_input_tokens_seen": 6542032, "step": 19435 }, { "epoch": 15.023183925811438, "grad_norm": 1.1752837896347046, "learning_rate": 8.855523695526427e-06, "loss": 0.373, "num_input_tokens_seen": 6543728, "step": 19440 }, { "epoch": 15.027047913446676, "grad_norm": 1.3048291206359863, "learning_rate": 8.842654479831895e-06, "loss": 0.5329, "num_input_tokens_seen": 6545392, "step": 19445 }, { "epoch": 15.030911901081916, "grad_norm": 0.9678881764411926, "learning_rate": 8.829792612594873e-06, "loss": 0.39, "num_input_tokens_seen": 6547248, "step": 19450 }, { "epoch": 15.034775888717157, "grad_norm": 1.0546033382415771, "learning_rate": 8.816938099665011e-06, "loss": 0.5812, "num_input_tokens_seen": 6548912, "step": 19455 }, { "epoch": 15.038639876352395, "grad_norm": 0.6634786128997803, "learning_rate": 8.804090946888618e-06, "loss": 0.3676, "num_input_tokens_seen": 6550352, "step": 19460 }, { "epoch": 15.042503863987635, "grad_norm": 0.6987351179122925, "learning_rate": 8.791251160108657e-06, "loss": 0.4051, "num_input_tokens_seen": 6552112, "step": 19465 }, { "epoch": 15.046367851622875, "grad_norm": 0.9345549941062927, "learning_rate": 8.778418745164733e-06, "loss": 0.4335, "num_input_tokens_seen": 6554000, "step": 19470 }, { "epoch": 15.050231839258114, "grad_norm": 1.7414939403533936, "learning_rate": 8.765593707893114e-06, "loss": 0.5904, "num_input_tokens_seen": 6555696, "step": 19475 }, { "epoch": 15.054095826893354, "grad_norm": 0.842167317867279, "learning_rate": 8.752776054126704e-06, "loss": 0.4883, "num_input_tokens_seen": 6557552, "step": 19480 }, { "epoch": 15.057959814528594, "grad_norm": 0.9600721001625061, "learning_rate": 8.739965789695034e-06, "loss": 0.3993, "num_input_tokens_seen": 6559088, "step": 19485 }, { "epoch": 15.061823802163833, "grad_norm": 1.3308857679367065, "learning_rate": 8.727162920424311e-06, "loss": 0.3884, "num_input_tokens_seen": 6560720, "step": 19490 }, { "epoch": 15.065687789799073, "grad_norm": 1.3387045860290527, "learning_rate": 8.714367452137348e-06, "loss": 0.448, "num_input_tokens_seen": 6562800, "step": 19495 }, { "epoch": 15.069551777434313, "grad_norm": 0.7179427146911621, "learning_rate": 8.701579390653595e-06, "loss": 0.3733, "num_input_tokens_seen": 6564368, "step": 19500 }, { "epoch": 15.073415765069551, "grad_norm": 1.0977267026901245, "learning_rate": 8.688798741789136e-06, "loss": 0.4049, "num_input_tokens_seen": 6566000, "step": 19505 }, { "epoch": 15.077279752704792, "grad_norm": 0.6720989942550659, "learning_rate": 8.6760255113567e-06, "loss": 0.4643, "num_input_tokens_seen": 6567728, "step": 19510 }, { "epoch": 15.08114374034003, "grad_norm": 0.9614031910896301, "learning_rate": 8.663259705165625e-06, "loss": 0.4071, "num_input_tokens_seen": 6569648, "step": 19515 }, { "epoch": 15.08500772797527, "grad_norm": 1.0611753463745117, "learning_rate": 8.65050132902187e-06, "loss": 0.4477, "num_input_tokens_seen": 6571472, "step": 19520 }, { "epoch": 15.08887171561051, "grad_norm": 0.9045168161392212, "learning_rate": 8.637750388728016e-06, "loss": 0.4217, "num_input_tokens_seen": 6573200, "step": 19525 }, { "epoch": 15.092735703245749, "grad_norm": 0.7974945306777954, "learning_rate": 8.625006890083284e-06, "loss": 0.3575, "num_input_tokens_seen": 6574832, "step": 19530 }, { "epoch": 15.09659969088099, "grad_norm": 1.2435845136642456, "learning_rate": 8.612270838883484e-06, "loss": 0.34, "num_input_tokens_seen": 6576464, "step": 19535 }, { "epoch": 15.10046367851623, "grad_norm": 0.7872374057769775, "learning_rate": 8.59954224092104e-06, "loss": 0.446, "num_input_tokens_seen": 6578160, "step": 19540 }, { "epoch": 15.104327666151468, "grad_norm": 0.7316175699234009, "learning_rate": 8.586821101985013e-06, "loss": 0.5568, "num_input_tokens_seen": 6579824, "step": 19545 }, { "epoch": 15.108191653786708, "grad_norm": 1.2227485179901123, "learning_rate": 8.574107427861042e-06, "loss": 0.9489, "num_input_tokens_seen": 6581424, "step": 19550 }, { "epoch": 15.112055641421948, "grad_norm": 1.4234325885772705, "learning_rate": 8.561401224331384e-06, "loss": 0.3615, "num_input_tokens_seen": 6583216, "step": 19555 }, { "epoch": 15.115919629057187, "grad_norm": 0.6213921308517456, "learning_rate": 8.548702497174896e-06, "loss": 0.4559, "num_input_tokens_seen": 6584752, "step": 19560 }, { "epoch": 15.119783616692427, "grad_norm": 1.1063095331192017, "learning_rate": 8.536011252167029e-06, "loss": 0.3623, "num_input_tokens_seen": 6586544, "step": 19565 }, { "epoch": 15.123647604327665, "grad_norm": 1.3124445676803589, "learning_rate": 8.523327495079847e-06, "loss": 0.8923, "num_input_tokens_seen": 6588496, "step": 19570 }, { "epoch": 15.127511591962906, "grad_norm": 1.2337939739227295, "learning_rate": 8.51065123168199e-06, "loss": 0.4346, "num_input_tokens_seen": 6589904, "step": 19575 }, { "epoch": 15.131375579598146, "grad_norm": 0.8796578645706177, "learning_rate": 8.497982467738713e-06, "loss": 0.4381, "num_input_tokens_seen": 6591696, "step": 19580 }, { "epoch": 15.135239567233384, "grad_norm": 1.0753329992294312, "learning_rate": 8.485321209011835e-06, "loss": 0.4524, "num_input_tokens_seen": 6593232, "step": 19585 }, { "epoch": 15.139103554868624, "grad_norm": 1.0583795309066772, "learning_rate": 8.472667461259773e-06, "loss": 0.3803, "num_input_tokens_seen": 6595216, "step": 19590 }, { "epoch": 15.142967542503865, "grad_norm": 0.697906494140625, "learning_rate": 8.46002123023753e-06, "loss": 0.4171, "num_input_tokens_seen": 6596912, "step": 19595 }, { "epoch": 15.146831530139103, "grad_norm": 1.5122157335281372, "learning_rate": 8.447382521696683e-06, "loss": 0.5586, "num_input_tokens_seen": 6598576, "step": 19600 }, { "epoch": 15.150695517774343, "grad_norm": 0.8597129583358765, "learning_rate": 8.434751341385388e-06, "loss": 0.4555, "num_input_tokens_seen": 6600208, "step": 19605 }, { "epoch": 15.154559505409583, "grad_norm": 1.0568071603775024, "learning_rate": 8.42212769504839e-06, "loss": 0.3563, "num_input_tokens_seen": 6602064, "step": 19610 }, { "epoch": 15.158423493044822, "grad_norm": 1.7913693189620972, "learning_rate": 8.409511588427002e-06, "loss": 0.6076, "num_input_tokens_seen": 6603600, "step": 19615 }, { "epoch": 15.162287480680062, "grad_norm": 0.6641151905059814, "learning_rate": 8.396903027259103e-06, "loss": 0.5028, "num_input_tokens_seen": 6605296, "step": 19620 }, { "epoch": 15.166151468315302, "grad_norm": 1.316248893737793, "learning_rate": 8.38430201727914e-06, "loss": 0.5181, "num_input_tokens_seen": 6606736, "step": 19625 }, { "epoch": 15.17001545595054, "grad_norm": 0.686500608921051, "learning_rate": 8.371708564218123e-06, "loss": 0.3953, "num_input_tokens_seen": 6608560, "step": 19630 }, { "epoch": 15.173879443585781, "grad_norm": 0.6197354793548584, "learning_rate": 8.359122673803638e-06, "loss": 0.3049, "num_input_tokens_seen": 6610160, "step": 19635 }, { "epoch": 15.17774343122102, "grad_norm": 1.6777136325836182, "learning_rate": 8.346544351759807e-06, "loss": 0.4984, "num_input_tokens_seen": 6612016, "step": 19640 }, { "epoch": 15.18160741885626, "grad_norm": 1.1304715871810913, "learning_rate": 8.333973603807341e-06, "loss": 0.389, "num_input_tokens_seen": 6613744, "step": 19645 }, { "epoch": 15.1854714064915, "grad_norm": 0.775952160358429, "learning_rate": 8.321410435663496e-06, "loss": 0.3834, "num_input_tokens_seen": 6615376, "step": 19650 }, { "epoch": 15.189335394126738, "grad_norm": 0.9804846048355103, "learning_rate": 8.30885485304207e-06, "loss": 0.3559, "num_input_tokens_seen": 6616976, "step": 19655 }, { "epoch": 15.193199381761978, "grad_norm": 1.0342885255813599, "learning_rate": 8.296306861653415e-06, "loss": 0.3956, "num_input_tokens_seen": 6618544, "step": 19660 }, { "epoch": 15.197063369397219, "grad_norm": 0.9454740285873413, "learning_rate": 8.283766467204438e-06, "loss": 0.4365, "num_input_tokens_seen": 6620208, "step": 19665 }, { "epoch": 15.200927357032457, "grad_norm": 0.9869521856307983, "learning_rate": 8.271233675398576e-06, "loss": 0.4215, "num_input_tokens_seen": 6621744, "step": 19670 }, { "epoch": 15.204791344667697, "grad_norm": 0.8766272068023682, "learning_rate": 8.258708491935819e-06, "loss": 0.3479, "num_input_tokens_seen": 6623568, "step": 19675 }, { "epoch": 15.208655332302937, "grad_norm": 0.7737771272659302, "learning_rate": 8.246190922512704e-06, "loss": 0.3657, "num_input_tokens_seen": 6625168, "step": 19680 }, { "epoch": 15.212519319938176, "grad_norm": 0.6999582648277283, "learning_rate": 8.233680972822286e-06, "loss": 0.3626, "num_input_tokens_seen": 6626608, "step": 19685 }, { "epoch": 15.216383307573416, "grad_norm": 0.5296787619590759, "learning_rate": 8.221178648554178e-06, "loss": 0.3463, "num_input_tokens_seen": 6628240, "step": 19690 }, { "epoch": 15.220247295208654, "grad_norm": 1.4996238946914673, "learning_rate": 8.208683955394506e-06, "loss": 0.6219, "num_input_tokens_seen": 6629872, "step": 19695 }, { "epoch": 15.224111282843895, "grad_norm": 1.4344666004180908, "learning_rate": 8.196196899025929e-06, "loss": 0.6674, "num_input_tokens_seen": 6631664, "step": 19700 }, { "epoch": 15.227975270479135, "grad_norm": 1.30551016330719, "learning_rate": 8.18371748512764e-06, "loss": 0.5119, "num_input_tokens_seen": 6633328, "step": 19705 }, { "epoch": 15.231839258114373, "grad_norm": 0.9803833365440369, "learning_rate": 8.171245719375337e-06, "loss": 0.4243, "num_input_tokens_seen": 6634800, "step": 19710 }, { "epoch": 15.235703245749614, "grad_norm": 0.864404022693634, "learning_rate": 8.15878160744127e-06, "loss": 0.547, "num_input_tokens_seen": 6636688, "step": 19715 }, { "epoch": 15.239567233384854, "grad_norm": 0.5906625986099243, "learning_rate": 8.146325154994189e-06, "loss": 0.4355, "num_input_tokens_seen": 6638096, "step": 19720 }, { "epoch": 15.243431221020092, "grad_norm": 1.6049965620040894, "learning_rate": 8.133876367699353e-06, "loss": 0.5791, "num_input_tokens_seen": 6639760, "step": 19725 }, { "epoch": 15.247295208655332, "grad_norm": 1.151047945022583, "learning_rate": 8.12143525121856e-06, "loss": 0.5583, "num_input_tokens_seen": 6641392, "step": 19730 }, { "epoch": 15.251159196290573, "grad_norm": 0.6062384843826294, "learning_rate": 8.109001811210093e-06, "loss": 0.3108, "num_input_tokens_seen": 6643024, "step": 19735 }, { "epoch": 15.255023183925811, "grad_norm": 0.8313153982162476, "learning_rate": 8.096576053328761e-06, "loss": 0.4813, "num_input_tokens_seen": 6644720, "step": 19740 }, { "epoch": 15.258887171561051, "grad_norm": 0.9421367049217224, "learning_rate": 8.084157983225862e-06, "loss": 0.3878, "num_input_tokens_seen": 6646448, "step": 19745 }, { "epoch": 15.262751159196291, "grad_norm": 0.9857114553451538, "learning_rate": 8.071747606549226e-06, "loss": 0.3941, "num_input_tokens_seen": 6647888, "step": 19750 }, { "epoch": 15.26661514683153, "grad_norm": 1.1031286716461182, "learning_rate": 8.059344928943157e-06, "loss": 0.4213, "num_input_tokens_seen": 6649808, "step": 19755 }, { "epoch": 15.27047913446677, "grad_norm": 1.072481632232666, "learning_rate": 8.04694995604847e-06, "loss": 0.4584, "num_input_tokens_seen": 6651632, "step": 19760 }, { "epoch": 15.274343122102009, "grad_norm": 0.5400086045265198, "learning_rate": 8.03456269350246e-06, "loss": 0.4637, "num_input_tokens_seen": 6653232, "step": 19765 }, { "epoch": 15.278207109737249, "grad_norm": 1.4076733589172363, "learning_rate": 8.02218314693895e-06, "loss": 0.5337, "num_input_tokens_seen": 6654992, "step": 19770 }, { "epoch": 15.282071097372489, "grad_norm": 0.8103440999984741, "learning_rate": 8.009811321988217e-06, "loss": 0.3757, "num_input_tokens_seen": 6656432, "step": 19775 }, { "epoch": 15.285935085007727, "grad_norm": 1.2467656135559082, "learning_rate": 7.99744722427704e-06, "loss": 0.3676, "num_input_tokens_seen": 6658160, "step": 19780 }, { "epoch": 15.289799072642968, "grad_norm": 0.8030087351799011, "learning_rate": 7.985090859428695e-06, "loss": 0.3833, "num_input_tokens_seen": 6660208, "step": 19785 }, { "epoch": 15.293663060278208, "grad_norm": 0.9123138189315796, "learning_rate": 7.97274223306293e-06, "loss": 0.3552, "num_input_tokens_seen": 6661712, "step": 19790 }, { "epoch": 15.297527047913446, "grad_norm": 1.048828125, "learning_rate": 7.960401350795965e-06, "loss": 0.5688, "num_input_tokens_seen": 6663536, "step": 19795 }, { "epoch": 15.301391035548686, "grad_norm": 0.6540021896362305, "learning_rate": 7.948068218240514e-06, "loss": 0.3738, "num_input_tokens_seen": 6665296, "step": 19800 }, { "epoch": 15.305255023183927, "grad_norm": 0.8190595507621765, "learning_rate": 7.93574284100575e-06, "loss": 0.4209, "num_input_tokens_seen": 6666768, "step": 19805 }, { "epoch": 15.309119010819165, "grad_norm": 0.7411916255950928, "learning_rate": 7.923425224697342e-06, "loss": 0.4149, "num_input_tokens_seen": 6668272, "step": 19810 }, { "epoch": 15.312982998454405, "grad_norm": 1.040999174118042, "learning_rate": 7.911115374917402e-06, "loss": 0.5895, "num_input_tokens_seen": 6670064, "step": 19815 }, { "epoch": 15.316846986089644, "grad_norm": 0.8639894127845764, "learning_rate": 7.89881329726454e-06, "loss": 0.4812, "num_input_tokens_seen": 6671536, "step": 19820 }, { "epoch": 15.320710973724884, "grad_norm": 0.853743851184845, "learning_rate": 7.886518997333805e-06, "loss": 0.3662, "num_input_tokens_seen": 6673360, "step": 19825 }, { "epoch": 15.324574961360124, "grad_norm": 1.1813263893127441, "learning_rate": 7.874232480716718e-06, "loss": 0.5551, "num_input_tokens_seen": 6674800, "step": 19830 }, { "epoch": 15.328438948995363, "grad_norm": 1.1309483051300049, "learning_rate": 7.861953753001262e-06, "loss": 0.4588, "num_input_tokens_seen": 6676944, "step": 19835 }, { "epoch": 15.332302936630603, "grad_norm": 1.483406901359558, "learning_rate": 7.849682819771872e-06, "loss": 0.5253, "num_input_tokens_seen": 6678576, "step": 19840 }, { "epoch": 15.336166924265843, "grad_norm": 1.248818278312683, "learning_rate": 7.83741968660944e-06, "loss": 0.4398, "num_input_tokens_seen": 6680240, "step": 19845 }, { "epoch": 15.340030911901081, "grad_norm": 1.8967547416687012, "learning_rate": 7.825164359091323e-06, "loss": 0.4862, "num_input_tokens_seen": 6681968, "step": 19850 }, { "epoch": 15.343894899536322, "grad_norm": 0.5230904817581177, "learning_rate": 7.812916842791304e-06, "loss": 0.4566, "num_input_tokens_seen": 6683760, "step": 19855 }, { "epoch": 15.347758887171562, "grad_norm": 0.8730406165122986, "learning_rate": 7.800677143279645e-06, "loss": 0.4009, "num_input_tokens_seen": 6685520, "step": 19860 }, { "epoch": 15.3516228748068, "grad_norm": 1.37865149974823, "learning_rate": 7.78844526612302e-06, "loss": 0.5138, "num_input_tokens_seen": 6687120, "step": 19865 }, { "epoch": 15.35548686244204, "grad_norm": 1.1457626819610596, "learning_rate": 7.776221216884566e-06, "loss": 0.325, "num_input_tokens_seen": 6688624, "step": 19870 }, { "epoch": 15.35935085007728, "grad_norm": 1.0298930406570435, "learning_rate": 7.764005001123851e-06, "loss": 0.3575, "num_input_tokens_seen": 6690128, "step": 19875 }, { "epoch": 15.363214837712519, "grad_norm": 1.3116297721862793, "learning_rate": 7.751796624396876e-06, "loss": 0.6313, "num_input_tokens_seen": 6692080, "step": 19880 }, { "epoch": 15.36707882534776, "grad_norm": 0.8468073606491089, "learning_rate": 7.7395960922561e-06, "loss": 0.3702, "num_input_tokens_seen": 6694128, "step": 19885 }, { "epoch": 15.370942812982998, "grad_norm": 1.0045945644378662, "learning_rate": 7.72740341025038e-06, "loss": 0.3771, "num_input_tokens_seen": 6695792, "step": 19890 }, { "epoch": 15.374806800618238, "grad_norm": 1.0495363473892212, "learning_rate": 7.71521858392504e-06, "loss": 0.5415, "num_input_tokens_seen": 6697424, "step": 19895 }, { "epoch": 15.378670788253478, "grad_norm": 0.9513674974441528, "learning_rate": 7.703041618821805e-06, "loss": 0.4757, "num_input_tokens_seen": 6699024, "step": 19900 }, { "epoch": 15.382534775888717, "grad_norm": 0.7436115741729736, "learning_rate": 7.690872520478825e-06, "loss": 0.3498, "num_input_tokens_seen": 6700816, "step": 19905 }, { "epoch": 15.386398763523957, "grad_norm": 1.416408896446228, "learning_rate": 7.678711294430685e-06, "loss": 0.7129, "num_input_tokens_seen": 6702320, "step": 19910 }, { "epoch": 15.390262751159197, "grad_norm": 1.3339927196502686, "learning_rate": 7.666557946208375e-06, "loss": 0.591, "num_input_tokens_seen": 6703952, "step": 19915 }, { "epoch": 15.394126738794435, "grad_norm": 1.0088797807693481, "learning_rate": 7.654412481339324e-06, "loss": 0.4846, "num_input_tokens_seen": 6705904, "step": 19920 }, { "epoch": 15.397990726429676, "grad_norm": 1.5412225723266602, "learning_rate": 7.642274905347353e-06, "loss": 0.4129, "num_input_tokens_seen": 6707408, "step": 19925 }, { "epoch": 15.401854714064916, "grad_norm": 1.13643217086792, "learning_rate": 7.630145223752699e-06, "loss": 0.4536, "num_input_tokens_seen": 6709168, "step": 19930 }, { "epoch": 15.405718701700154, "grad_norm": 1.1180908679962158, "learning_rate": 7.618023442072031e-06, "loss": 0.4117, "num_input_tokens_seen": 6710864, "step": 19935 }, { "epoch": 15.409582689335394, "grad_norm": 2.683293104171753, "learning_rate": 7.6059095658183975e-06, "loss": 0.6951, "num_input_tokens_seen": 6712624, "step": 19940 }, { "epoch": 15.413446676970633, "grad_norm": 0.8628725409507751, "learning_rate": 7.593803600501262e-06, "loss": 0.3982, "num_input_tokens_seen": 6714384, "step": 19945 }, { "epoch": 15.417310664605873, "grad_norm": 1.0932214260101318, "learning_rate": 7.581705551626489e-06, "loss": 0.5813, "num_input_tokens_seen": 6715888, "step": 19950 }, { "epoch": 15.421174652241113, "grad_norm": 1.1581001281738281, "learning_rate": 7.569615424696341e-06, "loss": 0.4775, "num_input_tokens_seen": 6717520, "step": 19955 }, { "epoch": 15.425038639876352, "grad_norm": 0.9332171678543091, "learning_rate": 7.55753322520949e-06, "loss": 0.5174, "num_input_tokens_seen": 6719248, "step": 19960 }, { "epoch": 15.428902627511592, "grad_norm": 0.9451935291290283, "learning_rate": 7.545458958660987e-06, "loss": 0.5043, "num_input_tokens_seen": 6720944, "step": 19965 }, { "epoch": 15.432766615146832, "grad_norm": 1.2107911109924316, "learning_rate": 7.533392630542272e-06, "loss": 0.3529, "num_input_tokens_seen": 6722736, "step": 19970 }, { "epoch": 15.43663060278207, "grad_norm": 0.6382949948310852, "learning_rate": 7.521334246341202e-06, "loss": 0.375, "num_input_tokens_seen": 6724464, "step": 19975 }, { "epoch": 15.44049459041731, "grad_norm": 1.05666983127594, "learning_rate": 7.509283811541992e-06, "loss": 0.5721, "num_input_tokens_seen": 6726064, "step": 19980 }, { "epoch": 15.444358578052551, "grad_norm": 1.59944748878479, "learning_rate": 7.497241331625252e-06, "loss": 0.4211, "num_input_tokens_seen": 6727696, "step": 19985 }, { "epoch": 15.44822256568779, "grad_norm": 1.0049386024475098, "learning_rate": 7.4852068120679656e-06, "loss": 0.3402, "num_input_tokens_seen": 6729328, "step": 19990 }, { "epoch": 15.45208655332303, "grad_norm": 0.8767766356468201, "learning_rate": 7.473180258343521e-06, "loss": 0.6121, "num_input_tokens_seen": 6730960, "step": 19995 }, { "epoch": 15.45595054095827, "grad_norm": 1.5592352151870728, "learning_rate": 7.46116167592166e-06, "loss": 0.5069, "num_input_tokens_seen": 6732720, "step": 20000 }, { "epoch": 15.459814528593508, "grad_norm": 1.021582007408142, "learning_rate": 7.449151070268504e-06, "loss": 0.4553, "num_input_tokens_seen": 6734416, "step": 20005 }, { "epoch": 15.463678516228748, "grad_norm": 1.0271811485290527, "learning_rate": 7.43714844684654e-06, "loss": 0.7674, "num_input_tokens_seen": 6736144, "step": 20010 }, { "epoch": 15.467542503863987, "grad_norm": 1.1399328708648682, "learning_rate": 7.425153811114652e-06, "loss": 0.3401, "num_input_tokens_seen": 6737776, "step": 20015 }, { "epoch": 15.471406491499227, "grad_norm": 1.2545220851898193, "learning_rate": 7.413167168528062e-06, "loss": 0.3039, "num_input_tokens_seen": 6739568, "step": 20020 }, { "epoch": 15.475270479134467, "grad_norm": 0.9854077100753784, "learning_rate": 7.4011885245383604e-06, "loss": 0.5265, "num_input_tokens_seen": 6741168, "step": 20025 }, { "epoch": 15.479134466769706, "grad_norm": 0.6788809299468994, "learning_rate": 7.389217884593519e-06, "loss": 0.5885, "num_input_tokens_seen": 6742864, "step": 20030 }, { "epoch": 15.482998454404946, "grad_norm": 0.6270418763160706, "learning_rate": 7.377255254137852e-06, "loss": 0.4618, "num_input_tokens_seen": 6744432, "step": 20035 }, { "epoch": 15.486862442040186, "grad_norm": 0.9439524412155151, "learning_rate": 7.3653006386120326e-06, "loss": 0.4172, "num_input_tokens_seen": 6746064, "step": 20040 }, { "epoch": 15.490726429675425, "grad_norm": 1.6093595027923584, "learning_rate": 7.353354043453092e-06, "loss": 0.5391, "num_input_tokens_seen": 6747568, "step": 20045 }, { "epoch": 15.494590417310665, "grad_norm": 1.0463651418685913, "learning_rate": 7.341415474094407e-06, "loss": 0.3806, "num_input_tokens_seen": 6749424, "step": 20050 }, { "epoch": 15.498454404945905, "grad_norm": 1.8125889301300049, "learning_rate": 7.329484935965728e-06, "loss": 0.5089, "num_input_tokens_seen": 6750960, "step": 20055 }, { "epoch": 15.502318392581143, "grad_norm": 1.4374240636825562, "learning_rate": 7.317562434493114e-06, "loss": 0.3944, "num_input_tokens_seen": 6752496, "step": 20060 }, { "epoch": 15.506182380216384, "grad_norm": 0.8161766529083252, "learning_rate": 7.305647975099009e-06, "loss": 0.5066, "num_input_tokens_seen": 6754352, "step": 20065 }, { "epoch": 15.510046367851622, "grad_norm": 1.3140524625778198, "learning_rate": 7.293741563202172e-06, "loss": 0.4485, "num_input_tokens_seen": 6756144, "step": 20070 }, { "epoch": 15.513910355486862, "grad_norm": 0.7920746803283691, "learning_rate": 7.281843204217711e-06, "loss": 0.3608, "num_input_tokens_seen": 6757904, "step": 20075 }, { "epoch": 15.517774343122102, "grad_norm": 1.3862755298614502, "learning_rate": 7.2699529035570705e-06, "loss": 0.619, "num_input_tokens_seen": 6759760, "step": 20080 }, { "epoch": 15.521638330757341, "grad_norm": 0.719479501247406, "learning_rate": 7.258070666628031e-06, "loss": 0.3704, "num_input_tokens_seen": 6761232, "step": 20085 }, { "epoch": 15.525502318392581, "grad_norm": 2.4246206283569336, "learning_rate": 7.246196498834695e-06, "loss": 0.8325, "num_input_tokens_seen": 6762704, "step": 20090 }, { "epoch": 15.529366306027821, "grad_norm": 0.9710299372673035, "learning_rate": 7.234330405577516e-06, "loss": 0.4743, "num_input_tokens_seen": 6764464, "step": 20095 }, { "epoch": 15.53323029366306, "grad_norm": 0.7781848907470703, "learning_rate": 7.2224723922532735e-06, "loss": 0.3478, "num_input_tokens_seen": 6766160, "step": 20100 }, { "epoch": 15.5370942812983, "grad_norm": 1.0298304557800293, "learning_rate": 7.210622464255049e-06, "loss": 0.4164, "num_input_tokens_seen": 6767664, "step": 20105 }, { "epoch": 15.54095826893354, "grad_norm": 1.212698221206665, "learning_rate": 7.198780626972265e-06, "loss": 0.5156, "num_input_tokens_seen": 6769168, "step": 20110 }, { "epoch": 15.544822256568779, "grad_norm": 0.7093024849891663, "learning_rate": 7.18694688579066e-06, "loss": 0.3589, "num_input_tokens_seen": 6770896, "step": 20115 }, { "epoch": 15.548686244204019, "grad_norm": 1.3238636255264282, "learning_rate": 7.17512124609229e-06, "loss": 0.4019, "num_input_tokens_seen": 6772368, "step": 20120 }, { "epoch": 15.552550231839259, "grad_norm": 0.85969477891922, "learning_rate": 7.163303713255515e-06, "loss": 0.501, "num_input_tokens_seen": 6774096, "step": 20125 }, { "epoch": 15.556414219474497, "grad_norm": 0.6163071393966675, "learning_rate": 7.1514942926550335e-06, "loss": 0.4126, "num_input_tokens_seen": 6775824, "step": 20130 }, { "epoch": 15.560278207109738, "grad_norm": 0.8568507432937622, "learning_rate": 7.139692989661845e-06, "loss": 0.5017, "num_input_tokens_seen": 6777520, "step": 20135 }, { "epoch": 15.564142194744976, "grad_norm": 1.3406310081481934, "learning_rate": 7.127899809643248e-06, "loss": 0.3763, "num_input_tokens_seen": 6779088, "step": 20140 }, { "epoch": 15.568006182380216, "grad_norm": 1.1190627813339233, "learning_rate": 7.1161147579628465e-06, "loss": 0.3384, "num_input_tokens_seen": 6780720, "step": 20145 }, { "epoch": 15.571870170015456, "grad_norm": 0.9957669973373413, "learning_rate": 7.10433783998056e-06, "loss": 0.4621, "num_input_tokens_seen": 6782832, "step": 20150 }, { "epoch": 15.575734157650695, "grad_norm": 0.6272773742675781, "learning_rate": 7.092569061052592e-06, "loss": 0.3324, "num_input_tokens_seen": 6784560, "step": 20155 }, { "epoch": 15.579598145285935, "grad_norm": 0.8692940473556519, "learning_rate": 7.080808426531455e-06, "loss": 0.5249, "num_input_tokens_seen": 6786096, "step": 20160 }, { "epoch": 15.583462132921175, "grad_norm": 0.6885052919387817, "learning_rate": 7.069055941765962e-06, "loss": 0.4361, "num_input_tokens_seen": 6788016, "step": 20165 }, { "epoch": 15.587326120556414, "grad_norm": 1.4149996042251587, "learning_rate": 7.0573116121012056e-06, "loss": 0.3964, "num_input_tokens_seen": 6789680, "step": 20170 }, { "epoch": 15.591190108191654, "grad_norm": 1.022096037864685, "learning_rate": 7.0455754428785904e-06, "loss": 0.4252, "num_input_tokens_seen": 6791216, "step": 20175 }, { "epoch": 15.595054095826894, "grad_norm": 1.3831850290298462, "learning_rate": 7.033847439435789e-06, "loss": 0.4049, "num_input_tokens_seen": 6792944, "step": 20180 }, { "epoch": 15.598918083462133, "grad_norm": 0.7929952144622803, "learning_rate": 7.0221276071067685e-06, "loss": 0.402, "num_input_tokens_seen": 6794832, "step": 20185 }, { "epoch": 15.602782071097373, "grad_norm": 0.9588839411735535, "learning_rate": 7.010415951221777e-06, "loss": 0.3528, "num_input_tokens_seen": 6796624, "step": 20190 }, { "epoch": 15.606646058732611, "grad_norm": 0.8940603137016296, "learning_rate": 6.998712477107336e-06, "loss": 0.4171, "num_input_tokens_seen": 6798448, "step": 20195 }, { "epoch": 15.610510046367851, "grad_norm": 1.7120908498764038, "learning_rate": 6.9870171900862755e-06, "loss": 0.4403, "num_input_tokens_seen": 6800080, "step": 20200 }, { "epoch": 15.614374034003092, "grad_norm": 1.3770595788955688, "learning_rate": 6.975330095477673e-06, "loss": 0.5623, "num_input_tokens_seen": 6801744, "step": 20205 }, { "epoch": 15.61823802163833, "grad_norm": 1.175826072692871, "learning_rate": 6.96365119859688e-06, "loss": 0.4612, "num_input_tokens_seen": 6803568, "step": 20210 }, { "epoch": 15.62210200927357, "grad_norm": 1.006251573562622, "learning_rate": 6.951980504755545e-06, "loss": 0.3327, "num_input_tokens_seen": 6805072, "step": 20215 }, { "epoch": 15.62596599690881, "grad_norm": 1.4025914669036865, "learning_rate": 6.940318019261563e-06, "loss": 0.6148, "num_input_tokens_seen": 6807184, "step": 20220 }, { "epoch": 15.629829984544049, "grad_norm": 0.5934174060821533, "learning_rate": 6.928663747419098e-06, "loss": 0.3396, "num_input_tokens_seen": 6808880, "step": 20225 }, { "epoch": 15.63369397217929, "grad_norm": 1.5922369956970215, "learning_rate": 6.91701769452858e-06, "loss": 0.3848, "num_input_tokens_seen": 6810672, "step": 20230 }, { "epoch": 15.63755795981453, "grad_norm": 1.3939121961593628, "learning_rate": 6.905379865886718e-06, "loss": 0.3753, "num_input_tokens_seen": 6812688, "step": 20235 }, { "epoch": 15.641421947449768, "grad_norm": 0.8070642352104187, "learning_rate": 6.8937502667864555e-06, "loss": 0.6608, "num_input_tokens_seen": 6814256, "step": 20240 }, { "epoch": 15.645285935085008, "grad_norm": 1.0011824369430542, "learning_rate": 6.8821289025170075e-06, "loss": 0.3597, "num_input_tokens_seen": 6815984, "step": 20245 }, { "epoch": 15.649149922720248, "grad_norm": 1.0737686157226562, "learning_rate": 6.8705157783638286e-06, "loss": 0.4146, "num_input_tokens_seen": 6817680, "step": 20250 }, { "epoch": 15.653013910355487, "grad_norm": 1.5799425840377808, "learning_rate": 6.858910899608656e-06, "loss": 0.4598, "num_input_tokens_seen": 6819376, "step": 20255 }, { "epoch": 15.656877897990727, "grad_norm": 0.9262670874595642, "learning_rate": 6.847314271529448e-06, "loss": 0.4219, "num_input_tokens_seen": 6821008, "step": 20260 }, { "epoch": 15.660741885625965, "grad_norm": 0.6970418095588684, "learning_rate": 6.835725899400417e-06, "loss": 0.3488, "num_input_tokens_seen": 6822608, "step": 20265 }, { "epoch": 15.664605873261205, "grad_norm": 0.7761216163635254, "learning_rate": 6.824145788492031e-06, "loss": 0.4613, "num_input_tokens_seen": 6824176, "step": 20270 }, { "epoch": 15.668469860896446, "grad_norm": 2.238663673400879, "learning_rate": 6.812573944070996e-06, "loss": 0.5787, "num_input_tokens_seen": 6826096, "step": 20275 }, { "epoch": 15.672333848531684, "grad_norm": 1.2683886289596558, "learning_rate": 6.801010371400249e-06, "loss": 0.3966, "num_input_tokens_seen": 6827888, "step": 20280 }, { "epoch": 15.676197836166924, "grad_norm": 1.003556728363037, "learning_rate": 6.789455075738973e-06, "loss": 0.4301, "num_input_tokens_seen": 6829616, "step": 20285 }, { "epoch": 15.680061823802165, "grad_norm": 0.7790185213088989, "learning_rate": 6.777908062342583e-06, "loss": 0.3579, "num_input_tokens_seen": 6831344, "step": 20290 }, { "epoch": 15.683925811437403, "grad_norm": 0.7541612982749939, "learning_rate": 6.766369336462742e-06, "loss": 0.5666, "num_input_tokens_seen": 6832944, "step": 20295 }, { "epoch": 15.687789799072643, "grad_norm": 1.171766757965088, "learning_rate": 6.7548389033473135e-06, "loss": 0.443, "num_input_tokens_seen": 6834448, "step": 20300 }, { "epoch": 15.691653786707883, "grad_norm": 2.268692970275879, "learning_rate": 6.743316768240426e-06, "loss": 0.581, "num_input_tokens_seen": 6836080, "step": 20305 }, { "epoch": 15.695517774343122, "grad_norm": 0.974099338054657, "learning_rate": 6.731802936382408e-06, "loss": 0.4581, "num_input_tokens_seen": 6837616, "step": 20310 }, { "epoch": 15.699381761978362, "grad_norm": 1.651658535003662, "learning_rate": 6.7202974130098185e-06, "loss": 0.4406, "num_input_tokens_seen": 6839280, "step": 20315 }, { "epoch": 15.7032457496136, "grad_norm": 1.1711153984069824, "learning_rate": 6.708800203355436e-06, "loss": 0.4648, "num_input_tokens_seen": 6840944, "step": 20320 }, { "epoch": 15.70710973724884, "grad_norm": 1.0684055089950562, "learning_rate": 6.697311312648266e-06, "loss": 0.7605, "num_input_tokens_seen": 6842448, "step": 20325 }, { "epoch": 15.71097372488408, "grad_norm": 1.0724377632141113, "learning_rate": 6.685830746113511e-06, "loss": 0.9133, "num_input_tokens_seen": 6844144, "step": 20330 }, { "epoch": 15.71483771251932, "grad_norm": 0.8285989761352539, "learning_rate": 6.674358508972614e-06, "loss": 0.3531, "num_input_tokens_seen": 6845872, "step": 20335 }, { "epoch": 15.71870170015456, "grad_norm": 1.0449466705322266, "learning_rate": 6.662894606443224e-06, "loss": 0.4315, "num_input_tokens_seen": 6847504, "step": 20340 }, { "epoch": 15.7225656877898, "grad_norm": 0.7430775761604309, "learning_rate": 6.65143904373918e-06, "loss": 0.4175, "num_input_tokens_seen": 6849136, "step": 20345 }, { "epoch": 15.726429675425038, "grad_norm": 1.0385138988494873, "learning_rate": 6.6399918260705466e-06, "loss": 0.3943, "num_input_tokens_seen": 6850704, "step": 20350 }, { "epoch": 15.730293663060278, "grad_norm": 1.0106481313705444, "learning_rate": 6.628552958643583e-06, "loss": 0.4481, "num_input_tokens_seen": 6852400, "step": 20355 }, { "epoch": 15.734157650695519, "grad_norm": 1.025590419769287, "learning_rate": 6.617122446660756e-06, "loss": 0.3831, "num_input_tokens_seen": 6853840, "step": 20360 }, { "epoch": 15.738021638330757, "grad_norm": 0.932803213596344, "learning_rate": 6.605700295320724e-06, "loss": 0.4147, "num_input_tokens_seen": 6855280, "step": 20365 }, { "epoch": 15.741885625965997, "grad_norm": 1.0307923555374146, "learning_rate": 6.594286509818359e-06, "loss": 0.4801, "num_input_tokens_seen": 6856880, "step": 20370 }, { "epoch": 15.745749613601237, "grad_norm": 1.2566391229629517, "learning_rate": 6.582881095344723e-06, "loss": 0.4117, "num_input_tokens_seen": 6858448, "step": 20375 }, { "epoch": 15.749613601236476, "grad_norm": 0.9740511775016785, "learning_rate": 6.57148405708706e-06, "loss": 0.5256, "num_input_tokens_seen": 6860208, "step": 20380 }, { "epoch": 15.753477588871716, "grad_norm": 0.7917287945747375, "learning_rate": 6.560095400228811e-06, "loss": 0.3906, "num_input_tokens_seen": 6861872, "step": 20385 }, { "epoch": 15.757341576506954, "grad_norm": 1.6718075275421143, "learning_rate": 6.548715129949607e-06, "loss": 0.4361, "num_input_tokens_seen": 6863536, "step": 20390 }, { "epoch": 15.761205564142195, "grad_norm": 1.053144931793213, "learning_rate": 6.537343251425263e-06, "loss": 0.3626, "num_input_tokens_seen": 6865392, "step": 20395 }, { "epoch": 15.765069551777435, "grad_norm": 1.2165452241897583, "learning_rate": 6.525979769827769e-06, "loss": 0.4158, "num_input_tokens_seen": 6867024, "step": 20400 }, { "epoch": 15.768933539412673, "grad_norm": 1.1865204572677612, "learning_rate": 6.514624690325319e-06, "loss": 0.4126, "num_input_tokens_seen": 6868816, "step": 20405 }, { "epoch": 15.772797527047913, "grad_norm": 0.8711721897125244, "learning_rate": 6.503278018082257e-06, "loss": 0.4556, "num_input_tokens_seen": 6870352, "step": 20410 }, { "epoch": 15.776661514683154, "grad_norm": 1.0023608207702637, "learning_rate": 6.491939758259133e-06, "loss": 0.391, "num_input_tokens_seen": 6871984, "step": 20415 }, { "epoch": 15.780525502318392, "grad_norm": 0.8917621970176697, "learning_rate": 6.480609916012647e-06, "loss": 0.4295, "num_input_tokens_seen": 6873488, "step": 20420 }, { "epoch": 15.784389489953632, "grad_norm": 0.811878502368927, "learning_rate": 6.469288496495682e-06, "loss": 0.4624, "num_input_tokens_seen": 6875120, "step": 20425 }, { "epoch": 15.788253477588873, "grad_norm": 0.6995303630828857, "learning_rate": 6.45797550485728e-06, "loss": 0.4036, "num_input_tokens_seen": 6876752, "step": 20430 }, { "epoch": 15.792117465224111, "grad_norm": 0.9631667137145996, "learning_rate": 6.446670946242659e-06, "loss": 0.4404, "num_input_tokens_seen": 6878352, "step": 20435 }, { "epoch": 15.795981452859351, "grad_norm": 0.8157589435577393, "learning_rate": 6.435374825793208e-06, "loss": 0.444, "num_input_tokens_seen": 6879856, "step": 20440 }, { "epoch": 15.79984544049459, "grad_norm": 0.9523016810417175, "learning_rate": 6.424087148646468e-06, "loss": 0.3882, "num_input_tokens_seen": 6881520, "step": 20445 }, { "epoch": 15.80370942812983, "grad_norm": 1.4109598398208618, "learning_rate": 6.412807919936128e-06, "loss": 0.4798, "num_input_tokens_seen": 6883152, "step": 20450 }, { "epoch": 15.80757341576507, "grad_norm": 0.8152108192443848, "learning_rate": 6.401537144792072e-06, "loss": 0.4094, "num_input_tokens_seen": 6884752, "step": 20455 }, { "epoch": 15.811437403400308, "grad_norm": 1.555547833442688, "learning_rate": 6.390274828340303e-06, "loss": 0.4337, "num_input_tokens_seen": 6886160, "step": 20460 }, { "epoch": 15.815301391035549, "grad_norm": 1.0828452110290527, "learning_rate": 6.37902097570299e-06, "loss": 0.4967, "num_input_tokens_seen": 6887792, "step": 20465 }, { "epoch": 15.819165378670789, "grad_norm": 0.7450079321861267, "learning_rate": 6.367775591998448e-06, "loss": 0.3715, "num_input_tokens_seen": 6889552, "step": 20470 }, { "epoch": 15.823029366306027, "grad_norm": 1.095799207687378, "learning_rate": 6.3565386823411565e-06, "loss": 0.4206, "num_input_tokens_seen": 6891280, "step": 20475 }, { "epoch": 15.826893353941268, "grad_norm": 1.511885404586792, "learning_rate": 6.345310251841727e-06, "loss": 0.4943, "num_input_tokens_seen": 6892848, "step": 20480 }, { "epoch": 15.830757341576508, "grad_norm": 0.8762086033821106, "learning_rate": 6.33409030560691e-06, "loss": 0.3661, "num_input_tokens_seen": 6894480, "step": 20485 }, { "epoch": 15.834621329211746, "grad_norm": 0.9926679134368896, "learning_rate": 6.3228788487396025e-06, "loss": 0.6552, "num_input_tokens_seen": 6895984, "step": 20490 }, { "epoch": 15.838485316846986, "grad_norm": 2.1736629009246826, "learning_rate": 6.311675886338852e-06, "loss": 0.4797, "num_input_tokens_seen": 6897552, "step": 20495 }, { "epoch": 15.842349304482227, "grad_norm": 0.9146932363510132, "learning_rate": 6.3004814234998326e-06, "loss": 0.3703, "num_input_tokens_seen": 6899280, "step": 20500 }, { "epoch": 15.846213292117465, "grad_norm": 1.8694775104522705, "learning_rate": 6.2892954653138384e-06, "loss": 0.5386, "num_input_tokens_seen": 6900944, "step": 20505 }, { "epoch": 15.850077279752705, "grad_norm": 0.8501408100128174, "learning_rate": 6.278118016868328e-06, "loss": 0.54, "num_input_tokens_seen": 6902640, "step": 20510 }, { "epoch": 15.853941267387944, "grad_norm": 0.7622623443603516, "learning_rate": 6.266949083246867e-06, "loss": 0.6271, "num_input_tokens_seen": 6904464, "step": 20515 }, { "epoch": 15.857805255023184, "grad_norm": 1.0317447185516357, "learning_rate": 6.255788669529147e-06, "loss": 0.5631, "num_input_tokens_seen": 6906032, "step": 20520 }, { "epoch": 15.861669242658424, "grad_norm": 0.8483880758285522, "learning_rate": 6.2446367807909995e-06, "loss": 0.4274, "num_input_tokens_seen": 6907568, "step": 20525 }, { "epoch": 15.865533230293662, "grad_norm": 1.3501616716384888, "learning_rate": 6.233493422104356e-06, "loss": 0.3898, "num_input_tokens_seen": 6909136, "step": 20530 }, { "epoch": 15.869397217928903, "grad_norm": 1.1770105361938477, "learning_rate": 6.2223585985372974e-06, "loss": 0.3554, "num_input_tokens_seen": 6910544, "step": 20535 }, { "epoch": 15.873261205564143, "grad_norm": 0.6952727437019348, "learning_rate": 6.211232315153998e-06, "loss": 0.4862, "num_input_tokens_seen": 6912240, "step": 20540 }, { "epoch": 15.877125193199381, "grad_norm": 1.5207178592681885, "learning_rate": 6.2001145770147705e-06, "loss": 0.5087, "num_input_tokens_seen": 6914064, "step": 20545 }, { "epoch": 15.880989180834622, "grad_norm": 1.1903716325759888, "learning_rate": 6.18900538917602e-06, "loss": 0.406, "num_input_tokens_seen": 6916016, "step": 20550 }, { "epoch": 15.884853168469862, "grad_norm": 1.171249270439148, "learning_rate": 6.177904756690276e-06, "loss": 0.3842, "num_input_tokens_seen": 6917680, "step": 20555 }, { "epoch": 15.8887171561051, "grad_norm": 0.657439112663269, "learning_rate": 6.166812684606165e-06, "loss": 0.4727, "num_input_tokens_seen": 6919504, "step": 20560 }, { "epoch": 15.89258114374034, "grad_norm": 1.0239189863204956, "learning_rate": 6.155729177968436e-06, "loss": 0.4175, "num_input_tokens_seen": 6921136, "step": 20565 }, { "epoch": 15.896445131375579, "grad_norm": 0.9233490824699402, "learning_rate": 6.144654241817924e-06, "loss": 0.4828, "num_input_tokens_seen": 6922736, "step": 20570 }, { "epoch": 15.900309119010819, "grad_norm": 0.9767964482307434, "learning_rate": 6.133587881191591e-06, "loss": 0.4051, "num_input_tokens_seen": 6924400, "step": 20575 }, { "epoch": 15.90417310664606, "grad_norm": 1.3612534999847412, "learning_rate": 6.122530101122464e-06, "loss": 0.5941, "num_input_tokens_seen": 6926096, "step": 20580 }, { "epoch": 15.908037094281298, "grad_norm": 0.623346209526062, "learning_rate": 6.111480906639713e-06, "loss": 0.3897, "num_input_tokens_seen": 6927760, "step": 20585 }, { "epoch": 15.911901081916538, "grad_norm": 1.0274662971496582, "learning_rate": 6.100440302768562e-06, "loss": 0.4308, "num_input_tokens_seen": 6929424, "step": 20590 }, { "epoch": 15.915765069551778, "grad_norm": 0.7322568893432617, "learning_rate": 6.089408294530344e-06, "loss": 0.4313, "num_input_tokens_seen": 6931184, "step": 20595 }, { "epoch": 15.919629057187016, "grad_norm": 1.2961291074752808, "learning_rate": 6.078384886942487e-06, "loss": 0.4105, "num_input_tokens_seen": 6932848, "step": 20600 }, { "epoch": 15.923493044822257, "grad_norm": 1.2109509706497192, "learning_rate": 6.067370085018495e-06, "loss": 0.4463, "num_input_tokens_seen": 6934448, "step": 20605 }, { "epoch": 15.927357032457497, "grad_norm": 0.841347336769104, "learning_rate": 6.056363893767975e-06, "loss": 0.374, "num_input_tokens_seen": 6936368, "step": 20610 }, { "epoch": 15.931221020092735, "grad_norm": 0.7064265012741089, "learning_rate": 6.0453663181965995e-06, "loss": 0.4023, "num_input_tokens_seen": 6938224, "step": 20615 }, { "epoch": 15.935085007727976, "grad_norm": 0.9110126495361328, "learning_rate": 6.034377363306146e-06, "loss": 0.502, "num_input_tokens_seen": 6939984, "step": 20620 }, { "epoch": 15.938948995363216, "grad_norm": 0.840764582157135, "learning_rate": 6.0233970340944465e-06, "loss": 0.3954, "num_input_tokens_seen": 6941616, "step": 20625 }, { "epoch": 15.942812982998454, "grad_norm": 1.3313477039337158, "learning_rate": 6.012425335555422e-06, "loss": 0.4599, "num_input_tokens_seen": 6943344, "step": 20630 }, { "epoch": 15.946676970633694, "grad_norm": 0.6651255488395691, "learning_rate": 6.0014622726790676e-06, "loss": 0.3176, "num_input_tokens_seen": 6945040, "step": 20635 }, { "epoch": 15.950540958268933, "grad_norm": 0.896440863609314, "learning_rate": 5.990507850451443e-06, "loss": 0.3895, "num_input_tokens_seen": 6947184, "step": 20640 }, { "epoch": 15.954404945904173, "grad_norm": 0.985559344291687, "learning_rate": 5.9795620738547e-06, "loss": 0.3668, "num_input_tokens_seen": 6949040, "step": 20645 }, { "epoch": 15.958268933539413, "grad_norm": 1.0227863788604736, "learning_rate": 5.9686249478670245e-06, "loss": 0.4149, "num_input_tokens_seen": 6950928, "step": 20650 }, { "epoch": 15.962132921174652, "grad_norm": 0.8931452631950378, "learning_rate": 5.957696477462704e-06, "loss": 0.4144, "num_input_tokens_seen": 6952496, "step": 20655 }, { "epoch": 15.965996908809892, "grad_norm": 1.4366459846496582, "learning_rate": 5.9467766676120666e-06, "loss": 0.4195, "num_input_tokens_seen": 6954384, "step": 20660 }, { "epoch": 15.969860896445132, "grad_norm": 1.3334777355194092, "learning_rate": 5.935865523281509e-06, "loss": 0.5554, "num_input_tokens_seen": 6955952, "step": 20665 }, { "epoch": 15.97372488408037, "grad_norm": 1.248913049697876, "learning_rate": 5.924963049433477e-06, "loss": 0.3866, "num_input_tokens_seen": 6957584, "step": 20670 }, { "epoch": 15.97758887171561, "grad_norm": 1.0578244924545288, "learning_rate": 5.914069251026489e-06, "loss": 0.4787, "num_input_tokens_seen": 6959440, "step": 20675 }, { "epoch": 15.98145285935085, "grad_norm": 0.6607025265693665, "learning_rate": 5.9031841330151e-06, "loss": 0.3897, "num_input_tokens_seen": 6961328, "step": 20680 }, { "epoch": 15.98531684698609, "grad_norm": 0.6198793649673462, "learning_rate": 5.892307700349939e-06, "loss": 0.6466, "num_input_tokens_seen": 6962896, "step": 20685 }, { "epoch": 15.98918083462133, "grad_norm": 1.0195258855819702, "learning_rate": 5.881439957977661e-06, "loss": 0.555, "num_input_tokens_seen": 6964528, "step": 20690 }, { "epoch": 15.993044822256568, "grad_norm": 1.2175244092941284, "learning_rate": 5.870580910840995e-06, "loss": 0.4079, "num_input_tokens_seen": 6966128, "step": 20695 }, { "epoch": 15.996908809891808, "grad_norm": 0.6497359275817871, "learning_rate": 5.85973056387869e-06, "loss": 0.4023, "num_input_tokens_seen": 6967728, "step": 20700 }, { "epoch": 16.0, "eval_loss": 0.46426212787628174, "eval_runtime": 6.3687, "eval_samples_per_second": 90.286, "eval_steps_per_second": 22.611, "num_input_tokens_seen": 6968992, "step": 20704 }, { "epoch": 16.00077279752705, "grad_norm": 0.8457368612289429, "learning_rate": 5.848888922025553e-06, "loss": 0.3553, "num_input_tokens_seen": 6969312, "step": 20705 }, { "epoch": 16.004636785162287, "grad_norm": 0.6846472024917603, "learning_rate": 5.838055990212424e-06, "loss": 0.8347, "num_input_tokens_seen": 6970944, "step": 20710 }, { "epoch": 16.00850077279753, "grad_norm": 0.7418405413627625, "learning_rate": 5.8272317733661815e-06, "loss": 0.4053, "num_input_tokens_seen": 6972608, "step": 20715 }, { "epoch": 16.012364760432767, "grad_norm": 1.4522273540496826, "learning_rate": 5.816416276409756e-06, "loss": 0.3594, "num_input_tokens_seen": 6974112, "step": 20720 }, { "epoch": 16.016228748068006, "grad_norm": 0.5865864157676697, "learning_rate": 5.805609504262094e-06, "loss": 0.4199, "num_input_tokens_seen": 6975968, "step": 20725 }, { "epoch": 16.020092735703244, "grad_norm": 1.0583598613739014, "learning_rate": 5.794811461838173e-06, "loss": 0.356, "num_input_tokens_seen": 6977568, "step": 20730 }, { "epoch": 16.023956723338486, "grad_norm": 2.6787478923797607, "learning_rate": 5.7840221540490234e-06, "loss": 0.5654, "num_input_tokens_seen": 6979232, "step": 20735 }, { "epoch": 16.027820710973725, "grad_norm": 0.7802587151527405, "learning_rate": 5.773241585801676e-06, "loss": 0.3962, "num_input_tokens_seen": 6980928, "step": 20740 }, { "epoch": 16.031684698608963, "grad_norm": 0.7459185123443604, "learning_rate": 5.762469761999201e-06, "loss": 0.3735, "num_input_tokens_seen": 6982496, "step": 20745 }, { "epoch": 16.035548686244205, "grad_norm": 1.2260847091674805, "learning_rate": 5.751706687540679e-06, "loss": 0.5502, "num_input_tokens_seen": 6984000, "step": 20750 }, { "epoch": 16.039412673879443, "grad_norm": 1.6226511001586914, "learning_rate": 5.740952367321237e-06, "loss": 0.4645, "num_input_tokens_seen": 6985696, "step": 20755 }, { "epoch": 16.043276661514682, "grad_norm": 0.8611540198326111, "learning_rate": 5.7302068062319965e-06, "loss": 0.5309, "num_input_tokens_seen": 6987520, "step": 20760 }, { "epoch": 16.047140649149924, "grad_norm": 1.465469479560852, "learning_rate": 5.719470009160102e-06, "loss": 0.4441, "num_input_tokens_seen": 6989152, "step": 20765 }, { "epoch": 16.051004636785162, "grad_norm": 0.7157455086708069, "learning_rate": 5.708741980988708e-06, "loss": 0.4618, "num_input_tokens_seen": 6990720, "step": 20770 }, { "epoch": 16.0548686244204, "grad_norm": 0.9015302658081055, "learning_rate": 5.698022726596996e-06, "loss": 0.4297, "num_input_tokens_seen": 6992352, "step": 20775 }, { "epoch": 16.058732612055643, "grad_norm": 0.91490238904953, "learning_rate": 5.687312250860147e-06, "loss": 0.3923, "num_input_tokens_seen": 6994208, "step": 20780 }, { "epoch": 16.06259659969088, "grad_norm": 0.7462961077690125, "learning_rate": 5.676610558649337e-06, "loss": 0.479, "num_input_tokens_seen": 6995776, "step": 20785 }, { "epoch": 16.06646058732612, "grad_norm": 0.5933719873428345, "learning_rate": 5.665917654831773e-06, "loss": 0.5355, "num_input_tokens_seen": 6997632, "step": 20790 }, { "epoch": 16.07032457496136, "grad_norm": 1.3930634260177612, "learning_rate": 5.655233544270649e-06, "loss": 0.6205, "num_input_tokens_seen": 6999776, "step": 20795 }, { "epoch": 16.0741885625966, "grad_norm": 0.8231056928634644, "learning_rate": 5.644558231825162e-06, "loss": 0.3418, "num_input_tokens_seen": 7001568, "step": 20800 }, { "epoch": 16.07805255023184, "grad_norm": 1.1014039516448975, "learning_rate": 5.633891722350504e-06, "loss": 0.4178, "num_input_tokens_seen": 7003552, "step": 20805 }, { "epoch": 16.08191653786708, "grad_norm": 1.3818126916885376, "learning_rate": 5.623234020697868e-06, "loss": 0.5799, "num_input_tokens_seen": 7005408, "step": 20810 }, { "epoch": 16.08578052550232, "grad_norm": 0.9148237705230713, "learning_rate": 5.612585131714437e-06, "loss": 0.3875, "num_input_tokens_seen": 7007168, "step": 20815 }, { "epoch": 16.089644513137557, "grad_norm": 1.0278093814849854, "learning_rate": 5.601945060243397e-06, "loss": 0.4277, "num_input_tokens_seen": 7008704, "step": 20820 }, { "epoch": 16.0935085007728, "grad_norm": 1.2803335189819336, "learning_rate": 5.591313811123919e-06, "loss": 0.4831, "num_input_tokens_seen": 7010496, "step": 20825 }, { "epoch": 16.097372488408038, "grad_norm": 0.9694924354553223, "learning_rate": 5.580691389191153e-06, "loss": 0.3854, "num_input_tokens_seen": 7012256, "step": 20830 }, { "epoch": 16.101236476043276, "grad_norm": 1.4029755592346191, "learning_rate": 5.570077799276241e-06, "loss": 0.3976, "num_input_tokens_seen": 7013824, "step": 20835 }, { "epoch": 16.105100463678518, "grad_norm": 1.2643673419952393, "learning_rate": 5.559473046206309e-06, "loss": 0.488, "num_input_tokens_seen": 7015360, "step": 20840 }, { "epoch": 16.108964451313756, "grad_norm": 0.9120920896530151, "learning_rate": 5.548877134804459e-06, "loss": 0.3558, "num_input_tokens_seen": 7016992, "step": 20845 }, { "epoch": 16.112828438948995, "grad_norm": 1.1122541427612305, "learning_rate": 5.538290069889768e-06, "loss": 0.5589, "num_input_tokens_seen": 7018880, "step": 20850 }, { "epoch": 16.116692426584233, "grad_norm": 0.7994176149368286, "learning_rate": 5.527711856277307e-06, "loss": 0.5014, "num_input_tokens_seen": 7020352, "step": 20855 }, { "epoch": 16.120556414219475, "grad_norm": 0.7225633859634399, "learning_rate": 5.5171424987781165e-06, "loss": 0.4364, "num_input_tokens_seen": 7021920, "step": 20860 }, { "epoch": 16.124420401854714, "grad_norm": 1.08596932888031, "learning_rate": 5.506582002199193e-06, "loss": 0.3509, "num_input_tokens_seen": 7023616, "step": 20865 }, { "epoch": 16.128284389489952, "grad_norm": 0.7417502999305725, "learning_rate": 5.496030371343519e-06, "loss": 0.3703, "num_input_tokens_seen": 7025376, "step": 20870 }, { "epoch": 16.132148377125194, "grad_norm": 0.7791482210159302, "learning_rate": 5.485487611010034e-06, "loss": 0.559, "num_input_tokens_seen": 7026944, "step": 20875 }, { "epoch": 16.136012364760433, "grad_norm": 0.6352646350860596, "learning_rate": 5.474953725993653e-06, "loss": 0.4474, "num_input_tokens_seen": 7028704, "step": 20880 }, { "epoch": 16.13987635239567, "grad_norm": 0.8879584670066833, "learning_rate": 5.46442872108524e-06, "loss": 0.3381, "num_input_tokens_seen": 7030400, "step": 20885 }, { "epoch": 16.143740340030913, "grad_norm": 0.8465701937675476, "learning_rate": 5.453912601071648e-06, "loss": 0.5032, "num_input_tokens_seen": 7032288, "step": 20890 }, { "epoch": 16.14760432766615, "grad_norm": 0.6943076848983765, "learning_rate": 5.443405370735655e-06, "loss": 0.3545, "num_input_tokens_seen": 7034208, "step": 20895 }, { "epoch": 16.15146831530139, "grad_norm": 1.1450964212417603, "learning_rate": 5.432907034856024e-06, "loss": 0.4806, "num_input_tokens_seen": 7035872, "step": 20900 }, { "epoch": 16.155332302936632, "grad_norm": 0.7848669290542603, "learning_rate": 5.4224175982074575e-06, "loss": 0.4243, "num_input_tokens_seen": 7037632, "step": 20905 }, { "epoch": 16.15919629057187, "grad_norm": 0.7990866899490356, "learning_rate": 5.411937065560613e-06, "loss": 0.509, "num_input_tokens_seen": 7039168, "step": 20910 }, { "epoch": 16.16306027820711, "grad_norm": 1.5807245969772339, "learning_rate": 5.401465441682099e-06, "loss": 0.3884, "num_input_tokens_seen": 7040800, "step": 20915 }, { "epoch": 16.16692426584235, "grad_norm": 2.3653934001922607, "learning_rate": 5.391002731334466e-06, "loss": 0.4645, "num_input_tokens_seen": 7042528, "step": 20920 }, { "epoch": 16.17078825347759, "grad_norm": 1.0009605884552002, "learning_rate": 5.380548939276231e-06, "loss": 0.3879, "num_input_tokens_seen": 7044448, "step": 20925 }, { "epoch": 16.174652241112828, "grad_norm": 0.9667382836341858, "learning_rate": 5.370104070261836e-06, "loss": 0.4392, "num_input_tokens_seen": 7046208, "step": 20930 }, { "epoch": 16.17851622874807, "grad_norm": 1.883257269859314, "learning_rate": 5.359668129041662e-06, "loss": 0.5193, "num_input_tokens_seen": 7047936, "step": 20935 }, { "epoch": 16.182380216383308, "grad_norm": 0.5564941763877869, "learning_rate": 5.34924112036205e-06, "loss": 0.3645, "num_input_tokens_seen": 7049664, "step": 20940 }, { "epoch": 16.186244204018546, "grad_norm": 0.8596094846725464, "learning_rate": 5.338823048965261e-06, "loss": 0.4073, "num_input_tokens_seen": 7051360, "step": 20945 }, { "epoch": 16.19010819165379, "grad_norm": 1.5955246686935425, "learning_rate": 5.3284139195894924e-06, "loss": 0.3885, "num_input_tokens_seen": 7053248, "step": 20950 }, { "epoch": 16.193972179289027, "grad_norm": 0.9144430756568909, "learning_rate": 5.318013736968877e-06, "loss": 0.3913, "num_input_tokens_seen": 7054976, "step": 20955 }, { "epoch": 16.197836166924265, "grad_norm": 2.05533504486084, "learning_rate": 5.307622505833493e-06, "loss": 0.5947, "num_input_tokens_seen": 7057056, "step": 20960 }, { "epoch": 16.201700154559504, "grad_norm": 1.0034539699554443, "learning_rate": 5.297240230909326e-06, "loss": 0.4899, "num_input_tokens_seen": 7058720, "step": 20965 }, { "epoch": 16.205564142194746, "grad_norm": 1.1113572120666504, "learning_rate": 5.2868669169182955e-06, "loss": 0.3359, "num_input_tokens_seen": 7060576, "step": 20970 }, { "epoch": 16.209428129829984, "grad_norm": 1.3067538738250732, "learning_rate": 5.2765025685782425e-06, "loss": 0.4065, "num_input_tokens_seen": 7062368, "step": 20975 }, { "epoch": 16.213292117465222, "grad_norm": 0.9111545085906982, "learning_rate": 5.266147190602949e-06, "loss": 0.5244, "num_input_tokens_seen": 7064032, "step": 20980 }, { "epoch": 16.217156105100464, "grad_norm": 0.7785804867744446, "learning_rate": 5.255800787702095e-06, "loss": 0.3859, "num_input_tokens_seen": 7065568, "step": 20985 }, { "epoch": 16.221020092735703, "grad_norm": 1.2911524772644043, "learning_rate": 5.245463364581277e-06, "loss": 0.3952, "num_input_tokens_seen": 7067200, "step": 20990 }, { "epoch": 16.22488408037094, "grad_norm": 1.2736873626708984, "learning_rate": 5.235134925942034e-06, "loss": 0.5249, "num_input_tokens_seen": 7068768, "step": 20995 }, { "epoch": 16.228748068006183, "grad_norm": 0.9808948040008545, "learning_rate": 5.2248154764817925e-06, "loss": 0.4161, "num_input_tokens_seen": 7070464, "step": 21000 }, { "epoch": 16.23261205564142, "grad_norm": 0.5941346287727356, "learning_rate": 5.214505020893903e-06, "loss": 0.337, "num_input_tokens_seen": 7072192, "step": 21005 }, { "epoch": 16.23647604327666, "grad_norm": 0.7030501365661621, "learning_rate": 5.204203563867619e-06, "loss": 0.4733, "num_input_tokens_seen": 7073856, "step": 21010 }, { "epoch": 16.240340030911902, "grad_norm": 1.2838335037231445, "learning_rate": 5.193911110088101e-06, "loss": 0.4299, "num_input_tokens_seen": 7075328, "step": 21015 }, { "epoch": 16.24420401854714, "grad_norm": 1.1849806308746338, "learning_rate": 5.183627664236429e-06, "loss": 0.439, "num_input_tokens_seen": 7077216, "step": 21020 }, { "epoch": 16.24806800618238, "grad_norm": 1.1950299739837646, "learning_rate": 5.173353230989567e-06, "loss": 0.5003, "num_input_tokens_seen": 7078784, "step": 21025 }, { "epoch": 16.25193199381762, "grad_norm": 0.510340690612793, "learning_rate": 5.163087815020398e-06, "loss": 0.3566, "num_input_tokens_seen": 7080448, "step": 21030 }, { "epoch": 16.25579598145286, "grad_norm": 1.358418583869934, "learning_rate": 5.152831420997689e-06, "loss": 0.3759, "num_input_tokens_seen": 7082112, "step": 21035 }, { "epoch": 16.259659969088098, "grad_norm": 0.7663089036941528, "learning_rate": 5.1425840535861106e-06, "loss": 0.5425, "num_input_tokens_seen": 7084160, "step": 21040 }, { "epoch": 16.26352395672334, "grad_norm": 1.5598978996276855, "learning_rate": 5.132345717446227e-06, "loss": 0.7344, "num_input_tokens_seen": 7085696, "step": 21045 }, { "epoch": 16.26738794435858, "grad_norm": 0.6278266310691833, "learning_rate": 5.12211641723449e-06, "loss": 0.4093, "num_input_tokens_seen": 7087168, "step": 21050 }, { "epoch": 16.271251931993817, "grad_norm": 1.0083799362182617, "learning_rate": 5.111896157603246e-06, "loss": 0.4871, "num_input_tokens_seen": 7088832, "step": 21055 }, { "epoch": 16.27511591962906, "grad_norm": 1.3448559045791626, "learning_rate": 5.101684943200735e-06, "loss": 0.5198, "num_input_tokens_seen": 7090688, "step": 21060 }, { "epoch": 16.278979907264297, "grad_norm": 0.8112397193908691, "learning_rate": 5.091482778671086e-06, "loss": 0.3282, "num_input_tokens_seen": 7092256, "step": 21065 }, { "epoch": 16.282843894899536, "grad_norm": 1.3930472135543823, "learning_rate": 5.081289668654296e-06, "loss": 0.5499, "num_input_tokens_seen": 7094144, "step": 21070 }, { "epoch": 16.286707882534778, "grad_norm": 1.1894900798797607, "learning_rate": 5.071105617786251e-06, "loss": 0.4362, "num_input_tokens_seen": 7095776, "step": 21075 }, { "epoch": 16.290571870170016, "grad_norm": 0.5949539542198181, "learning_rate": 5.060930630698724e-06, "loss": 0.5422, "num_input_tokens_seen": 7097376, "step": 21080 }, { "epoch": 16.294435857805254, "grad_norm": 0.9562528729438782, "learning_rate": 5.050764712019354e-06, "loss": 0.5027, "num_input_tokens_seen": 7098912, "step": 21085 }, { "epoch": 16.298299845440496, "grad_norm": 0.7318028211593628, "learning_rate": 5.040607866371658e-06, "loss": 0.3209, "num_input_tokens_seen": 7100448, "step": 21090 }, { "epoch": 16.302163833075735, "grad_norm": 1.0749706029891968, "learning_rate": 5.030460098375037e-06, "loss": 0.5237, "num_input_tokens_seen": 7102080, "step": 21095 }, { "epoch": 16.306027820710973, "grad_norm": 1.0668022632598877, "learning_rate": 5.0203214126447625e-06, "loss": 0.5121, "num_input_tokens_seen": 7103712, "step": 21100 }, { "epoch": 16.30989180834621, "grad_norm": 0.9536466598510742, "learning_rate": 5.010191813791962e-06, "loss": 0.5078, "num_input_tokens_seen": 7105344, "step": 21105 }, { "epoch": 16.313755795981454, "grad_norm": 1.2623090744018555, "learning_rate": 5.00007130642364e-06, "loss": 0.4392, "num_input_tokens_seen": 7106688, "step": 21110 }, { "epoch": 16.317619783616692, "grad_norm": 0.6014572381973267, "learning_rate": 4.989959895142663e-06, "loss": 0.3442, "num_input_tokens_seen": 7108480, "step": 21115 }, { "epoch": 16.32148377125193, "grad_norm": 0.75556480884552, "learning_rate": 4.979857584547762e-06, "loss": 0.4457, "num_input_tokens_seen": 7110176, "step": 21120 }, { "epoch": 16.325347758887172, "grad_norm": 1.2751411199569702, "learning_rate": 4.969764379233518e-06, "loss": 0.3819, "num_input_tokens_seen": 7112000, "step": 21125 }, { "epoch": 16.32921174652241, "grad_norm": 1.0044575929641724, "learning_rate": 4.959680283790399e-06, "loss": 0.4443, "num_input_tokens_seen": 7113856, "step": 21130 }, { "epoch": 16.33307573415765, "grad_norm": 0.9741230607032776, "learning_rate": 4.9496053028046965e-06, "loss": 0.3977, "num_input_tokens_seen": 7115296, "step": 21135 }, { "epoch": 16.33693972179289, "grad_norm": 0.8671777844429016, "learning_rate": 4.939539440858587e-06, "loss": 0.376, "num_input_tokens_seen": 7117120, "step": 21140 }, { "epoch": 16.34080370942813, "grad_norm": 1.477742314338684, "learning_rate": 4.929482702530078e-06, "loss": 0.4662, "num_input_tokens_seen": 7118848, "step": 21145 }, { "epoch": 16.344667697063368, "grad_norm": 1.7631253004074097, "learning_rate": 4.919435092393032e-06, "loss": 0.3768, "num_input_tokens_seen": 7120224, "step": 21150 }, { "epoch": 16.34853168469861, "grad_norm": 0.7569944262504578, "learning_rate": 4.909396615017164e-06, "loss": 0.365, "num_input_tokens_seen": 7121824, "step": 21155 }, { "epoch": 16.35239567233385, "grad_norm": 0.90385901927948, "learning_rate": 4.899367274968028e-06, "loss": 0.3586, "num_input_tokens_seen": 7123712, "step": 21160 }, { "epoch": 16.356259659969087, "grad_norm": 0.7363958358764648, "learning_rate": 4.889347076807038e-06, "loss": 0.6242, "num_input_tokens_seen": 7125344, "step": 21165 }, { "epoch": 16.36012364760433, "grad_norm": 1.0142924785614014, "learning_rate": 4.879336025091435e-06, "loss": 0.4322, "num_input_tokens_seen": 7127072, "step": 21170 }, { "epoch": 16.363987635239567, "grad_norm": 0.9152746200561523, "learning_rate": 4.869334124374303e-06, "loss": 0.4309, "num_input_tokens_seen": 7128640, "step": 21175 }, { "epoch": 16.367851622874806, "grad_norm": 0.6944326162338257, "learning_rate": 4.859341379204571e-06, "loss": 0.3857, "num_input_tokens_seen": 7130304, "step": 21180 }, { "epoch": 16.371715610510048, "grad_norm": 0.773118257522583, "learning_rate": 4.849357794126999e-06, "loss": 0.3805, "num_input_tokens_seen": 7132096, "step": 21185 }, { "epoch": 16.375579598145286, "grad_norm": 0.6648532748222351, "learning_rate": 4.8393833736821795e-06, "loss": 0.3528, "num_input_tokens_seen": 7133600, "step": 21190 }, { "epoch": 16.379443585780525, "grad_norm": 0.7497718930244446, "learning_rate": 4.8294181224065345e-06, "loss": 0.4002, "num_input_tokens_seen": 7135104, "step": 21195 }, { "epoch": 16.383307573415767, "grad_norm": 0.7684556245803833, "learning_rate": 4.8194620448323294e-06, "loss": 0.3127, "num_input_tokens_seen": 7136704, "step": 21200 }, { "epoch": 16.387171561051005, "grad_norm": 0.7816181182861328, "learning_rate": 4.809515145487642e-06, "loss": 0.3692, "num_input_tokens_seen": 7138304, "step": 21205 }, { "epoch": 16.391035548686244, "grad_norm": 0.8393576145172119, "learning_rate": 4.799577428896385e-06, "loss": 0.3631, "num_input_tokens_seen": 7140064, "step": 21210 }, { "epoch": 16.394899536321482, "grad_norm": 1.1115692853927612, "learning_rate": 4.789648899578278e-06, "loss": 0.4038, "num_input_tokens_seen": 7141728, "step": 21215 }, { "epoch": 16.398763523956724, "grad_norm": 1.070432424545288, "learning_rate": 4.7797295620488954e-06, "loss": 0.512, "num_input_tokens_seen": 7143520, "step": 21220 }, { "epoch": 16.402627511591962, "grad_norm": 1.0975232124328613, "learning_rate": 4.7698194208196045e-06, "loss": 0.3822, "num_input_tokens_seen": 7145216, "step": 21225 }, { "epoch": 16.4064914992272, "grad_norm": 0.7484143376350403, "learning_rate": 4.759918480397585e-06, "loss": 0.4428, "num_input_tokens_seen": 7147072, "step": 21230 }, { "epoch": 16.410355486862443, "grad_norm": 1.5459967851638794, "learning_rate": 4.750026745285863e-06, "loss": 0.4262, "num_input_tokens_seen": 7148608, "step": 21235 }, { "epoch": 16.41421947449768, "grad_norm": 0.696956992149353, "learning_rate": 4.740144219983247e-06, "loss": 0.5701, "num_input_tokens_seen": 7150464, "step": 21240 }, { "epoch": 16.41808346213292, "grad_norm": 0.8304007649421692, "learning_rate": 4.7302709089843744e-06, "loss": 0.4417, "num_input_tokens_seen": 7152032, "step": 21245 }, { "epoch": 16.42194744976816, "grad_norm": 1.1389269828796387, "learning_rate": 4.720406816779679e-06, "loss": 0.3919, "num_input_tokens_seen": 7153536, "step": 21250 }, { "epoch": 16.4258114374034, "grad_norm": 1.0047131776809692, "learning_rate": 4.71055194785541e-06, "loss": 0.5587, "num_input_tokens_seen": 7155200, "step": 21255 }, { "epoch": 16.42967542503864, "grad_norm": 0.9035833477973938, "learning_rate": 4.700706306693628e-06, "loss": 0.3636, "num_input_tokens_seen": 7156960, "step": 21260 }, { "epoch": 16.43353941267388, "grad_norm": 0.7512837648391724, "learning_rate": 4.69086989777218e-06, "loss": 0.4878, "num_input_tokens_seen": 7158624, "step": 21265 }, { "epoch": 16.43740340030912, "grad_norm": 0.9516302347183228, "learning_rate": 4.681042725564735e-06, "loss": 0.4102, "num_input_tokens_seen": 7160352, "step": 21270 }, { "epoch": 16.441267387944357, "grad_norm": 0.5707645416259766, "learning_rate": 4.671224794540746e-06, "loss": 0.4697, "num_input_tokens_seen": 7161952, "step": 21275 }, { "epoch": 16.4451313755796, "grad_norm": 0.9080810546875, "learning_rate": 4.661416109165462e-06, "loss": 0.3691, "num_input_tokens_seen": 7163744, "step": 21280 }, { "epoch": 16.448995363214838, "grad_norm": 0.9627942442893982, "learning_rate": 4.651616673899936e-06, "loss": 0.4158, "num_input_tokens_seen": 7165376, "step": 21285 }, { "epoch": 16.452859350850076, "grad_norm": 1.256012201309204, "learning_rate": 4.641826493201007e-06, "loss": 0.4044, "num_input_tokens_seen": 7167136, "step": 21290 }, { "epoch": 16.456723338485318, "grad_norm": 1.1898809671401978, "learning_rate": 4.632045571521304e-06, "loss": 0.4504, "num_input_tokens_seen": 7168640, "step": 21295 }, { "epoch": 16.460587326120557, "grad_norm": 1.3859293460845947, "learning_rate": 4.6222739133092605e-06, "loss": 0.5299, "num_input_tokens_seen": 7170336, "step": 21300 }, { "epoch": 16.464451313755795, "grad_norm": 1.334302544593811, "learning_rate": 4.6125115230090724e-06, "loss": 0.3374, "num_input_tokens_seen": 7172384, "step": 21305 }, { "epoch": 16.468315301391037, "grad_norm": 0.9215648174285889, "learning_rate": 4.602758405060745e-06, "loss": 0.3868, "num_input_tokens_seen": 7174240, "step": 21310 }, { "epoch": 16.472179289026275, "grad_norm": 1.4492084980010986, "learning_rate": 4.59301456390005e-06, "loss": 0.3457, "num_input_tokens_seen": 7175840, "step": 21315 }, { "epoch": 16.476043276661514, "grad_norm": 0.9113069176673889, "learning_rate": 4.583280003958546e-06, "loss": 0.4904, "num_input_tokens_seen": 7177408, "step": 21320 }, { "epoch": 16.479907264296756, "grad_norm": 0.9055107235908508, "learning_rate": 4.573554729663562e-06, "loss": 0.4987, "num_input_tokens_seen": 7178848, "step": 21325 }, { "epoch": 16.483771251931994, "grad_norm": 1.6384018659591675, "learning_rate": 4.563838745438215e-06, "loss": 0.4319, "num_input_tokens_seen": 7180384, "step": 21330 }, { "epoch": 16.487635239567233, "grad_norm": 1.2258089780807495, "learning_rate": 4.554132055701396e-06, "loss": 0.5309, "num_input_tokens_seen": 7181888, "step": 21335 }, { "epoch": 16.491499227202475, "grad_norm": 0.8112072348594666, "learning_rate": 4.544434664867761e-06, "loss": 0.397, "num_input_tokens_seen": 7183872, "step": 21340 }, { "epoch": 16.495363214837713, "grad_norm": 1.2916611433029175, "learning_rate": 4.534746577347748e-06, "loss": 0.3555, "num_input_tokens_seen": 7185504, "step": 21345 }, { "epoch": 16.49922720247295, "grad_norm": 0.9363958835601807, "learning_rate": 4.525067797547553e-06, "loss": 0.3937, "num_input_tokens_seen": 7187072, "step": 21350 }, { "epoch": 16.50309119010819, "grad_norm": 1.1994401216506958, "learning_rate": 4.515398329869144e-06, "loss": 0.7128, "num_input_tokens_seen": 7189280, "step": 21355 }, { "epoch": 16.506955177743432, "grad_norm": 0.8383431434631348, "learning_rate": 4.505738178710253e-06, "loss": 0.3322, "num_input_tokens_seen": 7190976, "step": 21360 }, { "epoch": 16.51081916537867, "grad_norm": 2.9001593589782715, "learning_rate": 4.496087348464365e-06, "loss": 0.5192, "num_input_tokens_seen": 7192672, "step": 21365 }, { "epoch": 16.51468315301391, "grad_norm": 2.272334337234497, "learning_rate": 4.486445843520751e-06, "loss": 0.4454, "num_input_tokens_seen": 7194528, "step": 21370 }, { "epoch": 16.51854714064915, "grad_norm": 1.5758475065231323, "learning_rate": 4.4768136682644124e-06, "loss": 0.6269, "num_input_tokens_seen": 7196128, "step": 21375 }, { "epoch": 16.52241112828439, "grad_norm": 1.244209885597229, "learning_rate": 4.467190827076134e-06, "loss": 0.7029, "num_input_tokens_seen": 7197696, "step": 21380 }, { "epoch": 16.526275115919628, "grad_norm": 1.6390953063964844, "learning_rate": 4.457577324332432e-06, "loss": 0.5809, "num_input_tokens_seen": 7199552, "step": 21385 }, { "epoch": 16.53013910355487, "grad_norm": 1.8763552904129028, "learning_rate": 4.447973164405586e-06, "loss": 0.4257, "num_input_tokens_seen": 7201216, "step": 21390 }, { "epoch": 16.534003091190108, "grad_norm": 0.7716416716575623, "learning_rate": 4.438378351663627e-06, "loss": 0.5449, "num_input_tokens_seen": 7202848, "step": 21395 }, { "epoch": 16.537867078825347, "grad_norm": 1.8236454725265503, "learning_rate": 4.428792890470332e-06, "loss": 0.4978, "num_input_tokens_seen": 7204512, "step": 21400 }, { "epoch": 16.54173106646059, "grad_norm": 0.9453743696212769, "learning_rate": 4.419216785185221e-06, "loss": 0.3743, "num_input_tokens_seen": 7206112, "step": 21405 }, { "epoch": 16.545595054095827, "grad_norm": 1.160061001777649, "learning_rate": 4.4096500401635734e-06, "loss": 0.4236, "num_input_tokens_seen": 7207616, "step": 21410 }, { "epoch": 16.549459041731065, "grad_norm": 0.5981807112693787, "learning_rate": 4.400092659756397e-06, "loss": 0.3834, "num_input_tokens_seen": 7209344, "step": 21415 }, { "epoch": 16.553323029366307, "grad_norm": 1.405393123626709, "learning_rate": 4.390544648310449e-06, "loss": 0.3678, "num_input_tokens_seen": 7210848, "step": 21420 }, { "epoch": 16.557187017001546, "grad_norm": 0.8211390972137451, "learning_rate": 4.38100601016822e-06, "loss": 0.4637, "num_input_tokens_seen": 7212352, "step": 21425 }, { "epoch": 16.561051004636784, "grad_norm": 0.9342989921569824, "learning_rate": 4.371476749667941e-06, "loss": 0.423, "num_input_tokens_seen": 7214080, "step": 21430 }, { "epoch": 16.564914992272026, "grad_norm": 1.1791805028915405, "learning_rate": 4.361956871143577e-06, "loss": 0.4291, "num_input_tokens_seen": 7215648, "step": 21435 }, { "epoch": 16.568778979907265, "grad_norm": 1.1847107410430908, "learning_rate": 4.352446378924818e-06, "loss": 0.4679, "num_input_tokens_seen": 7217472, "step": 21440 }, { "epoch": 16.572642967542503, "grad_norm": 1.5960273742675781, "learning_rate": 4.342945277337104e-06, "loss": 0.4266, "num_input_tokens_seen": 7219392, "step": 21445 }, { "epoch": 16.576506955177745, "grad_norm": 1.225079894065857, "learning_rate": 4.333453570701587e-06, "loss": 0.4547, "num_input_tokens_seen": 7221088, "step": 21450 }, { "epoch": 16.580370942812984, "grad_norm": 0.7683652639389038, "learning_rate": 4.32397126333515e-06, "loss": 0.4071, "num_input_tokens_seen": 7222656, "step": 21455 }, { "epoch": 16.584234930448222, "grad_norm": 0.9497233033180237, "learning_rate": 4.314498359550412e-06, "loss": 0.5481, "num_input_tokens_seen": 7224128, "step": 21460 }, { "epoch": 16.58809891808346, "grad_norm": 1.2866096496582031, "learning_rate": 4.3050348636556994e-06, "loss": 0.4813, "num_input_tokens_seen": 7225824, "step": 21465 }, { "epoch": 16.591962905718702, "grad_norm": 1.071914792060852, "learning_rate": 4.295580779955066e-06, "loss": 0.4227, "num_input_tokens_seen": 7227424, "step": 21470 }, { "epoch": 16.59582689335394, "grad_norm": 2.3161988258361816, "learning_rate": 4.286136112748285e-06, "loss": 0.4819, "num_input_tokens_seen": 7228960, "step": 21475 }, { "epoch": 16.59969088098918, "grad_norm": 0.9388306736946106, "learning_rate": 4.276700866330854e-06, "loss": 0.4352, "num_input_tokens_seen": 7230496, "step": 21480 }, { "epoch": 16.60355486862442, "grad_norm": 1.194115161895752, "learning_rate": 4.267275044993979e-06, "loss": 0.4496, "num_input_tokens_seen": 7232128, "step": 21485 }, { "epoch": 16.60741885625966, "grad_norm": 0.8420922756195068, "learning_rate": 4.257858653024577e-06, "loss": 0.3769, "num_input_tokens_seen": 7233984, "step": 21490 }, { "epoch": 16.611282843894898, "grad_norm": 1.3938268423080444, "learning_rate": 4.248451694705271e-06, "loss": 0.5236, "num_input_tokens_seen": 7235744, "step": 21495 }, { "epoch": 16.61514683153014, "grad_norm": 0.7959346771240234, "learning_rate": 4.239054174314417e-06, "loss": 0.3939, "num_input_tokens_seen": 7237440, "step": 21500 }, { "epoch": 16.61901081916538, "grad_norm": 0.9321817755699158, "learning_rate": 4.229666096126056e-06, "loss": 0.4092, "num_input_tokens_seen": 7239488, "step": 21505 }, { "epoch": 16.622874806800617, "grad_norm": 1.109991192817688, "learning_rate": 4.220287464409939e-06, "loss": 0.424, "num_input_tokens_seen": 7241408, "step": 21510 }, { "epoch": 16.62673879443586, "grad_norm": 0.7917014956474304, "learning_rate": 4.210918283431534e-06, "loss": 0.4341, "num_input_tokens_seen": 7242880, "step": 21515 }, { "epoch": 16.630602782071097, "grad_norm": 0.6526775360107422, "learning_rate": 4.201558557451993e-06, "loss": 0.5321, "num_input_tokens_seen": 7244672, "step": 21520 }, { "epoch": 16.634466769706336, "grad_norm": 1.3858833312988281, "learning_rate": 4.192208290728178e-06, "loss": 0.4424, "num_input_tokens_seen": 7246368, "step": 21525 }, { "epoch": 16.638330757341578, "grad_norm": 0.9636268019676208, "learning_rate": 4.182867487512645e-06, "loss": 0.4262, "num_input_tokens_seen": 7248032, "step": 21530 }, { "epoch": 16.642194744976816, "grad_norm": 1.0828304290771484, "learning_rate": 4.173536152053642e-06, "loss": 0.4158, "num_input_tokens_seen": 7249472, "step": 21535 }, { "epoch": 16.646058732612055, "grad_norm": 1.7849332094192505, "learning_rate": 4.164214288595128e-06, "loss": 0.4238, "num_input_tokens_seen": 7250912, "step": 21540 }, { "epoch": 16.649922720247297, "grad_norm": 2.39823579788208, "learning_rate": 4.154901901376729e-06, "loss": 0.4327, "num_input_tokens_seen": 7252864, "step": 21545 }, { "epoch": 16.653786707882535, "grad_norm": 0.9884030818939209, "learning_rate": 4.14559899463379e-06, "loss": 0.4695, "num_input_tokens_seen": 7254432, "step": 21550 }, { "epoch": 16.657650695517773, "grad_norm": 0.738211989402771, "learning_rate": 4.136305572597318e-06, "loss": 0.374, "num_input_tokens_seen": 7256032, "step": 21555 }, { "epoch": 16.661514683153015, "grad_norm": 2.347569227218628, "learning_rate": 4.127021639494022e-06, "loss": 0.3999, "num_input_tokens_seen": 7257664, "step": 21560 }, { "epoch": 16.665378670788254, "grad_norm": 1.120323657989502, "learning_rate": 4.117747199546285e-06, "loss": 0.386, "num_input_tokens_seen": 7259200, "step": 21565 }, { "epoch": 16.669242658423492, "grad_norm": 0.7697380185127258, "learning_rate": 4.108482256972182e-06, "loss": 0.3699, "num_input_tokens_seen": 7260992, "step": 21570 }, { "epoch": 16.673106646058734, "grad_norm": 0.730789303779602, "learning_rate": 4.099226815985458e-06, "loss": 0.5196, "num_input_tokens_seen": 7262560, "step": 21575 }, { "epoch": 16.676970633693973, "grad_norm": 0.8237324357032776, "learning_rate": 4.089980880795543e-06, "loss": 0.546, "num_input_tokens_seen": 7264256, "step": 21580 }, { "epoch": 16.68083462132921, "grad_norm": 0.7496929168701172, "learning_rate": 4.08074445560756e-06, "loss": 0.4048, "num_input_tokens_seen": 7266112, "step": 21585 }, { "epoch": 16.684698608964453, "grad_norm": 0.7617812156677246, "learning_rate": 4.071517544622278e-06, "loss": 0.4649, "num_input_tokens_seen": 7267552, "step": 21590 }, { "epoch": 16.68856259659969, "grad_norm": 0.8270379900932312, "learning_rate": 4.0623001520361494e-06, "loss": 0.5215, "num_input_tokens_seen": 7269216, "step": 21595 }, { "epoch": 16.69242658423493, "grad_norm": 1.046746850013733, "learning_rate": 4.053092282041307e-06, "loss": 0.4275, "num_input_tokens_seen": 7271072, "step": 21600 }, { "epoch": 16.69629057187017, "grad_norm": 1.3784921169281006, "learning_rate": 4.043893938825538e-06, "loss": 0.4714, "num_input_tokens_seen": 7272640, "step": 21605 }, { "epoch": 16.70015455950541, "grad_norm": 0.8949161767959595, "learning_rate": 4.034705126572299e-06, "loss": 0.3838, "num_input_tokens_seen": 7274400, "step": 21610 }, { "epoch": 16.70401854714065, "grad_norm": 1.0412694215774536, "learning_rate": 4.025525849460729e-06, "loss": 0.3798, "num_input_tokens_seen": 7276064, "step": 21615 }, { "epoch": 16.707882534775887, "grad_norm": 0.7938412427902222, "learning_rate": 4.016356111665617e-06, "loss": 0.625, "num_input_tokens_seen": 7277952, "step": 21620 }, { "epoch": 16.71174652241113, "grad_norm": 0.8674513101577759, "learning_rate": 4.007195917357412e-06, "loss": 0.3924, "num_input_tokens_seen": 7279456, "step": 21625 }, { "epoch": 16.715610510046368, "grad_norm": 1.8372716903686523, "learning_rate": 3.998045270702227e-06, "loss": 0.4877, "num_input_tokens_seen": 7280832, "step": 21630 }, { "epoch": 16.719474497681606, "grad_norm": 1.0757657289505005, "learning_rate": 3.988904175861827e-06, "loss": 0.4355, "num_input_tokens_seen": 7282304, "step": 21635 }, { "epoch": 16.723338485316848, "grad_norm": 1.5956655740737915, "learning_rate": 3.979772636993636e-06, "loss": 0.4523, "num_input_tokens_seen": 7283904, "step": 21640 }, { "epoch": 16.727202472952087, "grad_norm": 0.9709840416908264, "learning_rate": 3.970650658250732e-06, "loss": 0.417, "num_input_tokens_seen": 7285952, "step": 21645 }, { "epoch": 16.731066460587325, "grad_norm": 1.3020144701004028, "learning_rate": 3.961538243781854e-06, "loss": 0.446, "num_input_tokens_seen": 7287584, "step": 21650 }, { "epoch": 16.734930448222567, "grad_norm": 0.8285894989967346, "learning_rate": 3.9524353977313715e-06, "loss": 0.4741, "num_input_tokens_seen": 7289408, "step": 21655 }, { "epoch": 16.738794435857805, "grad_norm": 0.9484239220619202, "learning_rate": 3.943342124239324e-06, "loss": 0.3547, "num_input_tokens_seen": 7290784, "step": 21660 }, { "epoch": 16.742658423493044, "grad_norm": 0.9000973701477051, "learning_rate": 3.934258427441381e-06, "loss": 0.4466, "num_input_tokens_seen": 7292576, "step": 21665 }, { "epoch": 16.746522411128286, "grad_norm": 0.8341119885444641, "learning_rate": 3.925184311468865e-06, "loss": 0.5499, "num_input_tokens_seen": 7294336, "step": 21670 }, { "epoch": 16.750386398763524, "grad_norm": 2.1546478271484375, "learning_rate": 3.916119780448735e-06, "loss": 0.8669, "num_input_tokens_seen": 7296192, "step": 21675 }, { "epoch": 16.754250386398763, "grad_norm": 0.9342867732048035, "learning_rate": 3.907064838503591e-06, "loss": 0.6605, "num_input_tokens_seen": 7297984, "step": 21680 }, { "epoch": 16.758114374034005, "grad_norm": 1.0701181888580322, "learning_rate": 3.898019489751684e-06, "loss": 0.397, "num_input_tokens_seen": 7299776, "step": 21685 }, { "epoch": 16.761978361669243, "grad_norm": 1.2387118339538574, "learning_rate": 3.8889837383068864e-06, "loss": 0.5004, "num_input_tokens_seen": 7301440, "step": 21690 }, { "epoch": 16.76584234930448, "grad_norm": 1.0029878616333008, "learning_rate": 3.879957588278707e-06, "loss": 0.3982, "num_input_tokens_seen": 7303104, "step": 21695 }, { "epoch": 16.769706336939723, "grad_norm": 0.9909317493438721, "learning_rate": 3.870941043772308e-06, "loss": 0.3852, "num_input_tokens_seen": 7304960, "step": 21700 }, { "epoch": 16.773570324574962, "grad_norm": 0.6568207740783691, "learning_rate": 3.8619341088884595e-06, "loss": 0.4288, "num_input_tokens_seen": 7306560, "step": 21705 }, { "epoch": 16.7774343122102, "grad_norm": 0.680321216583252, "learning_rate": 3.852936787723568e-06, "loss": 0.3873, "num_input_tokens_seen": 7308480, "step": 21710 }, { "epoch": 16.78129829984544, "grad_norm": 0.9371611475944519, "learning_rate": 3.843949084369663e-06, "loss": 0.4172, "num_input_tokens_seen": 7310208, "step": 21715 }, { "epoch": 16.78516228748068, "grad_norm": 1.1952711343765259, "learning_rate": 3.83497100291442e-06, "loss": 0.4125, "num_input_tokens_seen": 7311936, "step": 21720 }, { "epoch": 16.78902627511592, "grad_norm": 0.7728267908096313, "learning_rate": 3.826002547441118e-06, "loss": 0.3475, "num_input_tokens_seen": 7313632, "step": 21725 }, { "epoch": 16.792890262751158, "grad_norm": 1.787865161895752, "learning_rate": 3.817043722028663e-06, "loss": 0.5803, "num_input_tokens_seen": 7315296, "step": 21730 }, { "epoch": 16.7967542503864, "grad_norm": 0.8406714797019958, "learning_rate": 3.808094530751577e-06, "loss": 0.5427, "num_input_tokens_seen": 7317056, "step": 21735 }, { "epoch": 16.800618238021638, "grad_norm": 1.4689064025878906, "learning_rate": 3.7991549776800197e-06, "loss": 0.5324, "num_input_tokens_seen": 7318624, "step": 21740 }, { "epoch": 16.804482225656876, "grad_norm": 1.9284207820892334, "learning_rate": 3.7902250668797435e-06, "loss": 0.4926, "num_input_tokens_seen": 7320224, "step": 21745 }, { "epoch": 16.80834621329212, "grad_norm": 1.3494869470596313, "learning_rate": 3.7813048024121196e-06, "loss": 0.6839, "num_input_tokens_seen": 7321824, "step": 21750 }, { "epoch": 16.812210200927357, "grad_norm": 1.17023766040802, "learning_rate": 3.7723941883341526e-06, "loss": 0.494, "num_input_tokens_seen": 7323776, "step": 21755 }, { "epoch": 16.816074188562595, "grad_norm": 0.7972486019134521, "learning_rate": 3.7634932286984363e-06, "loss": 0.3398, "num_input_tokens_seen": 7325504, "step": 21760 }, { "epoch": 16.819938176197837, "grad_norm": 1.1872851848602295, "learning_rate": 3.7546019275531806e-06, "loss": 0.4096, "num_input_tokens_seen": 7327264, "step": 21765 }, { "epoch": 16.823802163833076, "grad_norm": 1.484674334526062, "learning_rate": 3.7457202889422004e-06, "loss": 0.472, "num_input_tokens_seen": 7329024, "step": 21770 }, { "epoch": 16.827666151468314, "grad_norm": 1.3663218021392822, "learning_rate": 3.736848316904923e-06, "loss": 0.4381, "num_input_tokens_seen": 7330496, "step": 21775 }, { "epoch": 16.831530139103556, "grad_norm": 1.0733157396316528, "learning_rate": 3.727986015476362e-06, "loss": 0.5458, "num_input_tokens_seen": 7332256, "step": 21780 }, { "epoch": 16.835394126738795, "grad_norm": 0.8520432710647583, "learning_rate": 3.7191333886871543e-06, "loss": 0.4374, "num_input_tokens_seen": 7333856, "step": 21785 }, { "epoch": 16.839258114374033, "grad_norm": 0.617622971534729, "learning_rate": 3.710290440563535e-06, "loss": 0.4722, "num_input_tokens_seen": 7335584, "step": 21790 }, { "epoch": 16.843122102009275, "grad_norm": 1.4153077602386475, "learning_rate": 3.7014571751273207e-06, "loss": 0.589, "num_input_tokens_seen": 7337152, "step": 21795 }, { "epoch": 16.846986089644513, "grad_norm": 1.3480161428451538, "learning_rate": 3.692633596395936e-06, "loss": 0.5019, "num_input_tokens_seen": 7338880, "step": 21800 }, { "epoch": 16.850850077279752, "grad_norm": 0.6859736442565918, "learning_rate": 3.6838197083823965e-06, "loss": 0.3832, "num_input_tokens_seen": 7340384, "step": 21805 }, { "epoch": 16.854714064914994, "grad_norm": 1.676900863647461, "learning_rate": 3.675015515095312e-06, "loss": 0.4282, "num_input_tokens_seen": 7342144, "step": 21810 }, { "epoch": 16.858578052550232, "grad_norm": 1.187496542930603, "learning_rate": 3.6662210205388766e-06, "loss": 0.4469, "num_input_tokens_seen": 7343680, "step": 21815 }, { "epoch": 16.86244204018547, "grad_norm": 0.9099264144897461, "learning_rate": 3.657436228712882e-06, "loss": 0.4259, "num_input_tokens_seen": 7345184, "step": 21820 }, { "epoch": 16.866306027820713, "grad_norm": 0.9624759554862976, "learning_rate": 3.648661143612711e-06, "loss": 0.3964, "num_input_tokens_seen": 7346688, "step": 21825 }, { "epoch": 16.87017001545595, "grad_norm": 1.3990389108657837, "learning_rate": 3.6398957692293205e-06, "loss": 0.4228, "num_input_tokens_seen": 7348544, "step": 21830 }, { "epoch": 16.87403400309119, "grad_norm": 1.3709560632705688, "learning_rate": 3.631140109549258e-06, "loss": 0.3382, "num_input_tokens_seen": 7350400, "step": 21835 }, { "epoch": 16.87789799072643, "grad_norm": 1.5521663427352905, "learning_rate": 3.622394168554644e-06, "loss": 0.452, "num_input_tokens_seen": 7352256, "step": 21840 }, { "epoch": 16.88176197836167, "grad_norm": 1.0401087999343872, "learning_rate": 3.613657950223187e-06, "loss": 0.4049, "num_input_tokens_seen": 7353856, "step": 21845 }, { "epoch": 16.88562596599691, "grad_norm": 1.049354910850525, "learning_rate": 3.6049314585281686e-06, "loss": 0.455, "num_input_tokens_seen": 7355648, "step": 21850 }, { "epoch": 16.889489953632147, "grad_norm": 0.8547669649124146, "learning_rate": 3.5962146974384575e-06, "loss": 0.7013, "num_input_tokens_seen": 7357056, "step": 21855 }, { "epoch": 16.89335394126739, "grad_norm": 0.7887431979179382, "learning_rate": 3.5875076709184773e-06, "loss": 0.5847, "num_input_tokens_seen": 7358752, "step": 21860 }, { "epoch": 16.897217928902627, "grad_norm": 0.8748289942741394, "learning_rate": 3.578810382928249e-06, "loss": 0.3385, "num_input_tokens_seen": 7360448, "step": 21865 }, { "epoch": 16.901081916537866, "grad_norm": 0.9238148927688599, "learning_rate": 3.570122837423348e-06, "loss": 0.5463, "num_input_tokens_seen": 7362464, "step": 21870 }, { "epoch": 16.904945904173108, "grad_norm": 1.2964575290679932, "learning_rate": 3.5614450383549157e-06, "loss": 0.4296, "num_input_tokens_seen": 7364128, "step": 21875 }, { "epoch": 16.908809891808346, "grad_norm": 0.8784136176109314, "learning_rate": 3.5527769896696706e-06, "loss": 0.4631, "num_input_tokens_seen": 7365984, "step": 21880 }, { "epoch": 16.912673879443584, "grad_norm": 0.5986944437026978, "learning_rate": 3.5441186953098894e-06, "loss": 0.412, "num_input_tokens_seen": 7367648, "step": 21885 }, { "epoch": 16.916537867078826, "grad_norm": 1.1120493412017822, "learning_rate": 3.535470159213425e-06, "loss": 0.5024, "num_input_tokens_seen": 7369408, "step": 21890 }, { "epoch": 16.920401854714065, "grad_norm": 1.4003939628601074, "learning_rate": 3.5268313853136754e-06, "loss": 0.367, "num_input_tokens_seen": 7371200, "step": 21895 }, { "epoch": 16.924265842349303, "grad_norm": 1.2608189582824707, "learning_rate": 3.5182023775396062e-06, "loss": 0.5061, "num_input_tokens_seen": 7372800, "step": 21900 }, { "epoch": 16.928129829984545, "grad_norm": 1.000364065170288, "learning_rate": 3.50958313981575e-06, "loss": 0.3764, "num_input_tokens_seen": 7374400, "step": 21905 }, { "epoch": 16.931993817619784, "grad_norm": 1.0682239532470703, "learning_rate": 3.500973676062183e-06, "loss": 0.6017, "num_input_tokens_seen": 7375968, "step": 21910 }, { "epoch": 16.935857805255022, "grad_norm": 1.6834853887557983, "learning_rate": 3.492373990194542e-06, "loss": 0.4019, "num_input_tokens_seen": 7377536, "step": 21915 }, { "epoch": 16.939721792890264, "grad_norm": 0.7105234861373901, "learning_rate": 3.483784086124009e-06, "loss": 0.3627, "num_input_tokens_seen": 7379392, "step": 21920 }, { "epoch": 16.943585780525503, "grad_norm": 1.4252756834030151, "learning_rate": 3.4752039677573316e-06, "loss": 0.4868, "num_input_tokens_seen": 7381312, "step": 21925 }, { "epoch": 16.94744976816074, "grad_norm": 0.967720091342926, "learning_rate": 3.4666336389967996e-06, "loss": 0.6452, "num_input_tokens_seen": 7383008, "step": 21930 }, { "epoch": 16.951313755795983, "grad_norm": 1.2478020191192627, "learning_rate": 3.458073103740245e-06, "loss": 0.447, "num_input_tokens_seen": 7384512, "step": 21935 }, { "epoch": 16.95517774343122, "grad_norm": 0.7345311045646667, "learning_rate": 3.44952236588105e-06, "loss": 0.5333, "num_input_tokens_seen": 7386080, "step": 21940 }, { "epoch": 16.95904173106646, "grad_norm": 0.6988607048988342, "learning_rate": 3.440981429308146e-06, "loss": 0.3929, "num_input_tokens_seen": 7387808, "step": 21945 }, { "epoch": 16.962905718701702, "grad_norm": 1.0403207540512085, "learning_rate": 3.4324502979060006e-06, "loss": 0.5257, "num_input_tokens_seen": 7389440, "step": 21950 }, { "epoch": 16.96676970633694, "grad_norm": 1.7408777475357056, "learning_rate": 3.423928975554616e-06, "loss": 0.4576, "num_input_tokens_seen": 7391168, "step": 21955 }, { "epoch": 16.97063369397218, "grad_norm": 1.1809247732162476, "learning_rate": 3.415417466129556e-06, "loss": 0.4404, "num_input_tokens_seen": 7392736, "step": 21960 }, { "epoch": 16.974497681607417, "grad_norm": 0.9111348390579224, "learning_rate": 3.4069157735018953e-06, "loss": 0.3785, "num_input_tokens_seen": 7394400, "step": 21965 }, { "epoch": 16.97836166924266, "grad_norm": 1.7484067678451538, "learning_rate": 3.3984239015382557e-06, "loss": 0.4465, "num_input_tokens_seen": 7395840, "step": 21970 }, { "epoch": 16.982225656877898, "grad_norm": 1.707769513130188, "learning_rate": 3.3899418541007947e-06, "loss": 0.4302, "num_input_tokens_seen": 7397600, "step": 21975 }, { "epoch": 16.986089644513136, "grad_norm": 1.0131007432937622, "learning_rate": 3.381469635047191e-06, "loss": 0.3762, "num_input_tokens_seen": 7399360, "step": 21980 }, { "epoch": 16.989953632148378, "grad_norm": 1.0282741785049438, "learning_rate": 3.3730072482306697e-06, "loss": 0.5063, "num_input_tokens_seen": 7401120, "step": 21985 }, { "epoch": 16.993817619783616, "grad_norm": 0.6850438714027405, "learning_rate": 3.3645546974999636e-06, "loss": 0.3746, "num_input_tokens_seen": 7402656, "step": 21990 }, { "epoch": 16.997681607418855, "grad_norm": 0.9365905523300171, "learning_rate": 3.356111986699359e-06, "loss": 0.3908, "num_input_tokens_seen": 7404192, "step": 21995 }, { "epoch": 17.0, "eval_loss": 0.4629610776901245, "eval_runtime": 6.3721, "eval_samples_per_second": 90.238, "eval_steps_per_second": 22.599, "num_input_tokens_seen": 7405040, "step": 21998 }, { "epoch": 17.001545595054097, "grad_norm": 1.0614560842514038, "learning_rate": 3.3476791196686426e-06, "loss": 0.3935, "num_input_tokens_seen": 7405616, "step": 22000 }, { "epoch": 17.005409582689335, "grad_norm": 0.8906858563423157, "learning_rate": 3.3392561002431323e-06, "loss": 0.3949, "num_input_tokens_seen": 7407216, "step": 22005 }, { "epoch": 17.009273570324574, "grad_norm": 1.1906455755233765, "learning_rate": 3.3308429322536692e-06, "loss": 0.3855, "num_input_tokens_seen": 7408656, "step": 22010 }, { "epoch": 17.013137557959816, "grad_norm": 0.8375556468963623, "learning_rate": 3.3224396195266127e-06, "loss": 0.4223, "num_input_tokens_seen": 7410288, "step": 22015 }, { "epoch": 17.017001545595054, "grad_norm": 1.1962339878082275, "learning_rate": 3.31404616588383e-06, "loss": 0.3787, "num_input_tokens_seen": 7412272, "step": 22020 }, { "epoch": 17.020865533230292, "grad_norm": 0.8182069659233093, "learning_rate": 3.3056625751427317e-06, "loss": 0.5173, "num_input_tokens_seen": 7414288, "step": 22025 }, { "epoch": 17.024729520865534, "grad_norm": 0.9161118268966675, "learning_rate": 3.297288851116212e-06, "loss": 0.4467, "num_input_tokens_seen": 7415888, "step": 22030 }, { "epoch": 17.028593508500773, "grad_norm": 0.9222292304039001, "learning_rate": 3.2889249976126995e-06, "loss": 0.4617, "num_input_tokens_seen": 7417488, "step": 22035 }, { "epoch": 17.03245749613601, "grad_norm": 0.7372550368309021, "learning_rate": 3.280571018436121e-06, "loss": 0.5931, "num_input_tokens_seen": 7419280, "step": 22040 }, { "epoch": 17.036321483771253, "grad_norm": 1.050800085067749, "learning_rate": 3.272226917385915e-06, "loss": 0.4652, "num_input_tokens_seen": 7421008, "step": 22045 }, { "epoch": 17.04018547140649, "grad_norm": 0.9628223776817322, "learning_rate": 3.263892698257029e-06, "loss": 0.6694, "num_input_tokens_seen": 7422480, "step": 22050 }, { "epoch": 17.04404945904173, "grad_norm": 2.115872383117676, "learning_rate": 3.2555683648399118e-06, "loss": 0.4293, "num_input_tokens_seen": 7424208, "step": 22055 }, { "epoch": 17.047913446676972, "grad_norm": 1.3395211696624756, "learning_rate": 3.2472539209205316e-06, "loss": 0.4654, "num_input_tokens_seen": 7426224, "step": 22060 }, { "epoch": 17.05177743431221, "grad_norm": 0.9713035821914673, "learning_rate": 3.238949370280331e-06, "loss": 0.5226, "num_input_tokens_seen": 7427760, "step": 22065 }, { "epoch": 17.05564142194745, "grad_norm": 0.9455863237380981, "learning_rate": 3.230654716696288e-06, "loss": 0.443, "num_input_tokens_seen": 7429456, "step": 22070 }, { "epoch": 17.05950540958269, "grad_norm": 1.281211018562317, "learning_rate": 3.2223699639408493e-06, "loss": 0.3963, "num_input_tokens_seen": 7431056, "step": 22075 }, { "epoch": 17.06336939721793, "grad_norm": 0.9020193815231323, "learning_rate": 3.2140951157819703e-06, "loss": 0.6065, "num_input_tokens_seen": 7432784, "step": 22080 }, { "epoch": 17.067233384853168, "grad_norm": 0.9658635854721069, "learning_rate": 3.2058301759831073e-06, "loss": 0.5156, "num_input_tokens_seen": 7434640, "step": 22085 }, { "epoch": 17.071097372488406, "grad_norm": 0.8903360962867737, "learning_rate": 3.197575148303192e-06, "loss": 0.4743, "num_input_tokens_seen": 7436208, "step": 22090 }, { "epoch": 17.07496136012365, "grad_norm": 1.1155318021774292, "learning_rate": 3.1893300364966766e-06, "loss": 0.3847, "num_input_tokens_seen": 7437968, "step": 22095 }, { "epoch": 17.078825347758887, "grad_norm": 0.7940590977668762, "learning_rate": 3.181094844313473e-06, "loss": 0.3768, "num_input_tokens_seen": 7439888, "step": 22100 }, { "epoch": 17.082689335394125, "grad_norm": 0.8966137766838074, "learning_rate": 3.1728695754990074e-06, "loss": 0.4363, "num_input_tokens_seen": 7441520, "step": 22105 }, { "epoch": 17.086553323029367, "grad_norm": 2.4444923400878906, "learning_rate": 3.164654233794176e-06, "loss": 0.3954, "num_input_tokens_seen": 7443152, "step": 22110 }, { "epoch": 17.090417310664606, "grad_norm": 0.8472685813903809, "learning_rate": 3.1564488229353677e-06, "loss": 0.5292, "num_input_tokens_seen": 7444816, "step": 22115 }, { "epoch": 17.094281298299844, "grad_norm": 1.5969828367233276, "learning_rate": 3.1482533466544477e-06, "loss": 0.3542, "num_input_tokens_seen": 7446320, "step": 22120 }, { "epoch": 17.098145285935086, "grad_norm": 1.0729894638061523, "learning_rate": 3.140067808678773e-06, "loss": 0.5019, "num_input_tokens_seen": 7447952, "step": 22125 }, { "epoch": 17.102009273570324, "grad_norm": 0.9288684129714966, "learning_rate": 3.131892212731169e-06, "loss": 0.3293, "num_input_tokens_seen": 7449584, "step": 22130 }, { "epoch": 17.105873261205563, "grad_norm": 1.3973522186279297, "learning_rate": 3.1237265625299524e-06, "loss": 0.4169, "num_input_tokens_seen": 7451088, "step": 22135 }, { "epoch": 17.109737248840805, "grad_norm": 0.6945351958274841, "learning_rate": 3.1155708617889023e-06, "loss": 0.4104, "num_input_tokens_seen": 7452816, "step": 22140 }, { "epoch": 17.113601236476043, "grad_norm": 1.3559406995773315, "learning_rate": 3.107425114217291e-06, "loss": 0.5345, "num_input_tokens_seen": 7454672, "step": 22145 }, { "epoch": 17.11746522411128, "grad_norm": 1.0798084735870361, "learning_rate": 3.0992893235198466e-06, "loss": 0.3362, "num_input_tokens_seen": 7456432, "step": 22150 }, { "epoch": 17.121329211746524, "grad_norm": 1.1818898916244507, "learning_rate": 3.091163493396776e-06, "loss": 0.3511, "num_input_tokens_seen": 7457840, "step": 22155 }, { "epoch": 17.125193199381762, "grad_norm": 0.8202179074287415, "learning_rate": 3.0830476275437533e-06, "loss": 0.4174, "num_input_tokens_seen": 7459600, "step": 22160 }, { "epoch": 17.129057187017, "grad_norm": 0.6414034366607666, "learning_rate": 3.0749417296519228e-06, "loss": 0.4517, "num_input_tokens_seen": 7461232, "step": 22165 }, { "epoch": 17.132921174652243, "grad_norm": 0.8066489100456238, "learning_rate": 3.066845803407903e-06, "loss": 0.7117, "num_input_tokens_seen": 7462992, "step": 22170 }, { "epoch": 17.13678516228748, "grad_norm": 0.7051856517791748, "learning_rate": 3.0587598524937617e-06, "loss": 0.4026, "num_input_tokens_seen": 7464592, "step": 22175 }, { "epoch": 17.14064914992272, "grad_norm": 1.1680487394332886, "learning_rate": 3.050683880587038e-06, "loss": 0.5129, "num_input_tokens_seen": 7466352, "step": 22180 }, { "epoch": 17.14451313755796, "grad_norm": 1.0563050508499146, "learning_rate": 3.0426178913607383e-06, "loss": 0.3837, "num_input_tokens_seen": 7467760, "step": 22185 }, { "epoch": 17.1483771251932, "grad_norm": 0.8660011887550354, "learning_rate": 3.03456188848332e-06, "loss": 0.3457, "num_input_tokens_seen": 7469552, "step": 22190 }, { "epoch": 17.152241112828438, "grad_norm": 1.4789361953735352, "learning_rate": 3.026515875618702e-06, "loss": 0.4333, "num_input_tokens_seen": 7471376, "step": 22195 }, { "epoch": 17.15610510046368, "grad_norm": 1.7513344287872314, "learning_rate": 3.0184798564262513e-06, "loss": 0.7111, "num_input_tokens_seen": 7473008, "step": 22200 }, { "epoch": 17.15996908809892, "grad_norm": 0.8584872484207153, "learning_rate": 3.0104538345608085e-06, "loss": 0.6789, "num_input_tokens_seen": 7475056, "step": 22205 }, { "epoch": 17.163833075734157, "grad_norm": 0.8611422181129456, "learning_rate": 3.002437813672651e-06, "loss": 0.5185, "num_input_tokens_seen": 7476720, "step": 22210 }, { "epoch": 17.167697063369395, "grad_norm": 1.0721839666366577, "learning_rate": 2.9944317974075153e-06, "loss": 0.4715, "num_input_tokens_seen": 7478608, "step": 22215 }, { "epoch": 17.171561051004637, "grad_norm": 1.1747331619262695, "learning_rate": 2.9864357894065805e-06, "loss": 0.6111, "num_input_tokens_seen": 7480176, "step": 22220 }, { "epoch": 17.175425038639876, "grad_norm": 1.1373670101165771, "learning_rate": 2.978449793306487e-06, "loss": 0.3819, "num_input_tokens_seen": 7481744, "step": 22225 }, { "epoch": 17.179289026275114, "grad_norm": 1.0421512126922607, "learning_rate": 2.9704738127393078e-06, "loss": 0.4109, "num_input_tokens_seen": 7483376, "step": 22230 }, { "epoch": 17.183153013910356, "grad_norm": 1.347981333732605, "learning_rate": 2.962507851332566e-06, "loss": 0.6191, "num_input_tokens_seen": 7485008, "step": 22235 }, { "epoch": 17.187017001545595, "grad_norm": 1.5544500350952148, "learning_rate": 2.954551912709233e-06, "loss": 0.5448, "num_input_tokens_seen": 7486768, "step": 22240 }, { "epoch": 17.190880989180833, "grad_norm": 0.7477402687072754, "learning_rate": 2.9466060004877174e-06, "loss": 0.4219, "num_input_tokens_seen": 7488400, "step": 22245 }, { "epoch": 17.194744976816075, "grad_norm": 1.7919347286224365, "learning_rate": 2.938670118281864e-06, "loss": 0.4543, "num_input_tokens_seen": 7489872, "step": 22250 }, { "epoch": 17.198608964451314, "grad_norm": 0.8807328343391418, "learning_rate": 2.9307442697009606e-06, "loss": 0.4428, "num_input_tokens_seen": 7491536, "step": 22255 }, { "epoch": 17.202472952086552, "grad_norm": 1.069827914237976, "learning_rate": 2.922828458349727e-06, "loss": 0.4317, "num_input_tokens_seen": 7493168, "step": 22260 }, { "epoch": 17.206336939721794, "grad_norm": 1.3862390518188477, "learning_rate": 2.914922687828331e-06, "loss": 0.5903, "num_input_tokens_seen": 7495056, "step": 22265 }, { "epoch": 17.210200927357032, "grad_norm": 1.377577304840088, "learning_rate": 2.9070269617323537e-06, "loss": 0.3294, "num_input_tokens_seen": 7496752, "step": 22270 }, { "epoch": 17.21406491499227, "grad_norm": 1.705899715423584, "learning_rate": 2.8991412836528285e-06, "loss": 0.4729, "num_input_tokens_seen": 7498640, "step": 22275 }, { "epoch": 17.217928902627513, "grad_norm": 2.110386610031128, "learning_rate": 2.8912656571762036e-06, "loss": 0.4702, "num_input_tokens_seen": 7500272, "step": 22280 }, { "epoch": 17.22179289026275, "grad_norm": 1.3803167343139648, "learning_rate": 2.883400085884361e-06, "loss": 0.4981, "num_input_tokens_seen": 7502064, "step": 22285 }, { "epoch": 17.22565687789799, "grad_norm": 0.9117465615272522, "learning_rate": 2.8755445733546134e-06, "loss": 0.6264, "num_input_tokens_seen": 7503856, "step": 22290 }, { "epoch": 17.22952086553323, "grad_norm": 0.6271985769271851, "learning_rate": 2.8676991231596894e-06, "loss": 0.3583, "num_input_tokens_seen": 7505328, "step": 22295 }, { "epoch": 17.23338485316847, "grad_norm": 1.2434355020523071, "learning_rate": 2.859863738867746e-06, "loss": 0.4736, "num_input_tokens_seen": 7506864, "step": 22300 }, { "epoch": 17.23724884080371, "grad_norm": 1.829138159751892, "learning_rate": 2.8520384240423665e-06, "loss": 0.3671, "num_input_tokens_seen": 7508624, "step": 22305 }, { "epoch": 17.24111282843895, "grad_norm": 1.1410537958145142, "learning_rate": 2.8442231822425532e-06, "loss": 0.3525, "num_input_tokens_seen": 7510416, "step": 22310 }, { "epoch": 17.24497681607419, "grad_norm": 0.6872135996818542, "learning_rate": 2.836418017022724e-06, "loss": 0.401, "num_input_tokens_seen": 7512112, "step": 22315 }, { "epoch": 17.248840803709427, "grad_norm": 1.345947265625, "learning_rate": 2.8286229319327147e-06, "loss": 0.5662, "num_input_tokens_seen": 7513712, "step": 22320 }, { "epoch": 17.25270479134467, "grad_norm": 0.9952886700630188, "learning_rate": 2.8208379305177725e-06, "loss": 0.3739, "num_input_tokens_seen": 7515440, "step": 22325 }, { "epoch": 17.256568778979908, "grad_norm": 1.7551677227020264, "learning_rate": 2.813063016318565e-06, "loss": 0.8454, "num_input_tokens_seen": 7517136, "step": 22330 }, { "epoch": 17.260432766615146, "grad_norm": 1.0139062404632568, "learning_rate": 2.805298192871167e-06, "loss": 0.4403, "num_input_tokens_seen": 7518928, "step": 22335 }, { "epoch": 17.264296754250385, "grad_norm": 0.785887598991394, "learning_rate": 2.7975434637070698e-06, "loss": 0.4753, "num_input_tokens_seen": 7520432, "step": 22340 }, { "epoch": 17.268160741885627, "grad_norm": 1.1903069019317627, "learning_rate": 2.789798832353174e-06, "loss": 0.3899, "num_input_tokens_seen": 7522384, "step": 22345 }, { "epoch": 17.272024729520865, "grad_norm": 1.4635676145553589, "learning_rate": 2.7820643023317827e-06, "loss": 0.4009, "num_input_tokens_seen": 7524144, "step": 22350 }, { "epoch": 17.275888717156104, "grad_norm": 1.480864405632019, "learning_rate": 2.7743398771606034e-06, "loss": 0.6469, "num_input_tokens_seen": 7525840, "step": 22355 }, { "epoch": 17.279752704791346, "grad_norm": 1.1894928216934204, "learning_rate": 2.7666255603527535e-06, "loss": 0.398, "num_input_tokens_seen": 7527408, "step": 22360 }, { "epoch": 17.283616692426584, "grad_norm": 0.8001535534858704, "learning_rate": 2.7589213554167466e-06, "loss": 0.3497, "num_input_tokens_seen": 7528944, "step": 22365 }, { "epoch": 17.287480680061822, "grad_norm": 1.083371877670288, "learning_rate": 2.7512272658565012e-06, "loss": 0.4484, "num_input_tokens_seen": 7530768, "step": 22370 }, { "epoch": 17.291344667697064, "grad_norm": 0.9966205358505249, "learning_rate": 2.7435432951713443e-06, "loss": 0.3919, "num_input_tokens_seen": 7532592, "step": 22375 }, { "epoch": 17.295208655332303, "grad_norm": 1.1822764873504639, "learning_rate": 2.7358694468559766e-06, "loss": 0.3522, "num_input_tokens_seen": 7534320, "step": 22380 }, { "epoch": 17.29907264296754, "grad_norm": 1.1553986072540283, "learning_rate": 2.728205724400526e-06, "loss": 0.5182, "num_input_tokens_seen": 7536176, "step": 22385 }, { "epoch": 17.302936630602783, "grad_norm": 1.113404631614685, "learning_rate": 2.7205521312904937e-06, "loss": 0.3926, "num_input_tokens_seen": 7537872, "step": 22390 }, { "epoch": 17.30680061823802, "grad_norm": 1.039734959602356, "learning_rate": 2.712908671006775e-06, "loss": 0.3788, "num_input_tokens_seen": 7539600, "step": 22395 }, { "epoch": 17.31066460587326, "grad_norm": 0.8810380697250366, "learning_rate": 2.7052753470256683e-06, "loss": 0.5687, "num_input_tokens_seen": 7541488, "step": 22400 }, { "epoch": 17.314528593508502, "grad_norm": 1.1685197353363037, "learning_rate": 2.697652162818845e-06, "loss": 0.4793, "num_input_tokens_seen": 7542928, "step": 22405 }, { "epoch": 17.31839258114374, "grad_norm": 1.020884394645691, "learning_rate": 2.6900391218533882e-06, "loss": 0.5224, "num_input_tokens_seen": 7544464, "step": 22410 }, { "epoch": 17.32225656877898, "grad_norm": 1.317262887954712, "learning_rate": 2.6824362275917475e-06, "loss": 0.4399, "num_input_tokens_seen": 7546224, "step": 22415 }, { "epoch": 17.32612055641422, "grad_norm": 0.6408361196517944, "learning_rate": 2.6748434834917595e-06, "loss": 0.4263, "num_input_tokens_seen": 7547792, "step": 22420 }, { "epoch": 17.32998454404946, "grad_norm": 2.159876823425293, "learning_rate": 2.6672608930066596e-06, "loss": 0.4198, "num_input_tokens_seen": 7549360, "step": 22425 }, { "epoch": 17.333848531684698, "grad_norm": 0.9403836131095886, "learning_rate": 2.6596884595850523e-06, "loss": 0.5755, "num_input_tokens_seen": 7550832, "step": 22430 }, { "epoch": 17.33771251931994, "grad_norm": 1.3334453105926514, "learning_rate": 2.6521261866709224e-06, "loss": 0.4245, "num_input_tokens_seen": 7552592, "step": 22435 }, { "epoch": 17.341576506955178, "grad_norm": 1.6134852170944214, "learning_rate": 2.644574077703635e-06, "loss": 0.4229, "num_input_tokens_seen": 7554384, "step": 22440 }, { "epoch": 17.345440494590417, "grad_norm": 0.7616709470748901, "learning_rate": 2.637032136117945e-06, "loss": 0.395, "num_input_tokens_seen": 7556144, "step": 22445 }, { "epoch": 17.34930448222566, "grad_norm": 1.0119961500167847, "learning_rate": 2.6295003653439648e-06, "loss": 0.3532, "num_input_tokens_seen": 7557680, "step": 22450 }, { "epoch": 17.353168469860897, "grad_norm": 0.5677381753921509, "learning_rate": 2.62197876880719e-06, "loss": 0.3879, "num_input_tokens_seen": 7559344, "step": 22455 }, { "epoch": 17.357032457496135, "grad_norm": 0.7454410791397095, "learning_rate": 2.6144673499284842e-06, "loss": 0.4515, "num_input_tokens_seen": 7561040, "step": 22460 }, { "epoch": 17.360896445131374, "grad_norm": 0.9930897951126099, "learning_rate": 2.606966112124093e-06, "loss": 0.4284, "num_input_tokens_seen": 7562544, "step": 22465 }, { "epoch": 17.364760432766616, "grad_norm": 0.8918343782424927, "learning_rate": 2.59947505880562e-06, "loss": 0.4026, "num_input_tokens_seen": 7564240, "step": 22470 }, { "epoch": 17.368624420401854, "grad_norm": 1.1708869934082031, "learning_rate": 2.5919941933800373e-06, "loss": 0.385, "num_input_tokens_seen": 7566000, "step": 22475 }, { "epoch": 17.372488408037093, "grad_norm": 0.9218299388885498, "learning_rate": 2.5845235192496984e-06, "loss": 0.4688, "num_input_tokens_seen": 7567600, "step": 22480 }, { "epoch": 17.376352395672335, "grad_norm": 1.345497727394104, "learning_rate": 2.5770630398123026e-06, "loss": 0.4187, "num_input_tokens_seen": 7569712, "step": 22485 }, { "epoch": 17.380216383307573, "grad_norm": 1.2427594661712646, "learning_rate": 2.569612758460921e-06, "loss": 0.3999, "num_input_tokens_seen": 7571504, "step": 22490 }, { "epoch": 17.38408037094281, "grad_norm": 0.9987605810165405, "learning_rate": 2.5621726785839877e-06, "loss": 0.3424, "num_input_tokens_seen": 7573072, "step": 22495 }, { "epoch": 17.387944358578054, "grad_norm": 0.7411018013954163, "learning_rate": 2.554742803565291e-06, "loss": 0.4167, "num_input_tokens_seen": 7574960, "step": 22500 }, { "epoch": 17.391808346213292, "grad_norm": 0.9451136589050293, "learning_rate": 2.547323136783991e-06, "loss": 0.3674, "num_input_tokens_seen": 7576688, "step": 22505 }, { "epoch": 17.39567233384853, "grad_norm": 1.2387080192565918, "learning_rate": 2.539913681614589e-06, "loss": 0.7116, "num_input_tokens_seen": 7578288, "step": 22510 }, { "epoch": 17.399536321483772, "grad_norm": 0.6037672162055969, "learning_rate": 2.532514441426956e-06, "loss": 0.5067, "num_input_tokens_seen": 7580112, "step": 22515 }, { "epoch": 17.40340030911901, "grad_norm": 1.0488200187683105, "learning_rate": 2.525125419586308e-06, "loss": 0.4175, "num_input_tokens_seen": 7581584, "step": 22520 }, { "epoch": 17.40726429675425, "grad_norm": 1.4831690788269043, "learning_rate": 2.517746619453215e-06, "loss": 0.5663, "num_input_tokens_seen": 7583280, "step": 22525 }, { "epoch": 17.41112828438949, "grad_norm": 1.1281381845474243, "learning_rate": 2.510378044383602e-06, "loss": 0.4916, "num_input_tokens_seen": 7585072, "step": 22530 }, { "epoch": 17.41499227202473, "grad_norm": 1.0982571840286255, "learning_rate": 2.503019697728737e-06, "loss": 0.464, "num_input_tokens_seen": 7586704, "step": 22535 }, { "epoch": 17.418856259659968, "grad_norm": 0.840372622013092, "learning_rate": 2.4956715828352377e-06, "loss": 0.4438, "num_input_tokens_seen": 7588784, "step": 22540 }, { "epoch": 17.42272024729521, "grad_norm": 1.323749303817749, "learning_rate": 2.4883337030450786e-06, "loss": 0.484, "num_input_tokens_seen": 7590384, "step": 22545 }, { "epoch": 17.42658423493045, "grad_norm": 0.8156134486198425, "learning_rate": 2.4810060616955707e-06, "loss": 0.4929, "num_input_tokens_seen": 7592144, "step": 22550 }, { "epoch": 17.430448222565687, "grad_norm": 1.0373787879943848, "learning_rate": 2.4736886621193693e-06, "loss": 0.4415, "num_input_tokens_seen": 7593840, "step": 22555 }, { "epoch": 17.43431221020093, "grad_norm": 0.6399078965187073, "learning_rate": 2.466381507644469e-06, "loss": 0.4017, "num_input_tokens_seen": 7595600, "step": 22560 }, { "epoch": 17.438176197836167, "grad_norm": 1.5910331010818481, "learning_rate": 2.4590846015942053e-06, "loss": 0.383, "num_input_tokens_seen": 7597264, "step": 22565 }, { "epoch": 17.442040185471406, "grad_norm": 1.2679685354232788, "learning_rate": 2.451797947287257e-06, "loss": 0.3382, "num_input_tokens_seen": 7598768, "step": 22570 }, { "epoch": 17.445904173106648, "grad_norm": 1.4066678285598755, "learning_rate": 2.444521548037637e-06, "loss": 0.6156, "num_input_tokens_seen": 7600464, "step": 22575 }, { "epoch": 17.449768160741886, "grad_norm": 0.9076228141784668, "learning_rate": 2.437255407154693e-06, "loss": 0.3893, "num_input_tokens_seen": 7602160, "step": 22580 }, { "epoch": 17.453632148377125, "grad_norm": 0.9344832301139832, "learning_rate": 2.429999527943119e-06, "loss": 0.3699, "num_input_tokens_seen": 7603792, "step": 22585 }, { "epoch": 17.457496136012363, "grad_norm": 1.1139687299728394, "learning_rate": 2.422753913702924e-06, "loss": 0.4439, "num_input_tokens_seen": 7605520, "step": 22590 }, { "epoch": 17.461360123647605, "grad_norm": 0.9501067399978638, "learning_rate": 2.4155185677294607e-06, "loss": 0.4818, "num_input_tokens_seen": 7607216, "step": 22595 }, { "epoch": 17.465224111282843, "grad_norm": 0.8115643262863159, "learning_rate": 2.408293493313407e-06, "loss": 0.4726, "num_input_tokens_seen": 7609040, "step": 22600 }, { "epoch": 17.469088098918082, "grad_norm": 0.9902768731117249, "learning_rate": 2.4010786937407687e-06, "loss": 0.4011, "num_input_tokens_seen": 7610448, "step": 22605 }, { "epoch": 17.472952086553324, "grad_norm": 1.396833062171936, "learning_rate": 2.393874172292873e-06, "loss": 0.5208, "num_input_tokens_seen": 7611952, "step": 22610 }, { "epoch": 17.476816074188562, "grad_norm": 1.0116957426071167, "learning_rate": 2.386679932246394e-06, "loss": 0.4628, "num_input_tokens_seen": 7613424, "step": 22615 }, { "epoch": 17.4806800618238, "grad_norm": 0.6366719007492065, "learning_rate": 2.3794959768733e-06, "loss": 0.3872, "num_input_tokens_seen": 7615248, "step": 22620 }, { "epoch": 17.484544049459043, "grad_norm": 1.1501014232635498, "learning_rate": 2.3723223094409108e-06, "loss": 0.443, "num_input_tokens_seen": 7616912, "step": 22625 }, { "epoch": 17.48840803709428, "grad_norm": 0.8305791020393372, "learning_rate": 2.3651589332118474e-06, "loss": 0.4778, "num_input_tokens_seen": 7618576, "step": 22630 }, { "epoch": 17.49227202472952, "grad_norm": 0.8774911761283875, "learning_rate": 2.358005851444056e-06, "loss": 0.3355, "num_input_tokens_seen": 7620112, "step": 22635 }, { "epoch": 17.49613601236476, "grad_norm": 0.9079105854034424, "learning_rate": 2.3508630673908017e-06, "loss": 0.3976, "num_input_tokens_seen": 7621744, "step": 22640 }, { "epoch": 17.5, "grad_norm": 0.8518325090408325, "learning_rate": 2.3437305843006604e-06, "loss": 0.4122, "num_input_tokens_seen": 7623408, "step": 22645 }, { "epoch": 17.50386398763524, "grad_norm": 1.000963568687439, "learning_rate": 2.336608405417534e-06, "loss": 0.4677, "num_input_tokens_seen": 7625328, "step": 22650 }, { "epoch": 17.50772797527048, "grad_norm": 0.6841045618057251, "learning_rate": 2.3294965339806324e-06, "loss": 0.407, "num_input_tokens_seen": 7627120, "step": 22655 }, { "epoch": 17.51159196290572, "grad_norm": 0.8696278929710388, "learning_rate": 2.3223949732244704e-06, "loss": 0.4978, "num_input_tokens_seen": 7628912, "step": 22660 }, { "epoch": 17.515455950540957, "grad_norm": 1.2364243268966675, "learning_rate": 2.3153037263788925e-06, "loss": 0.4044, "num_input_tokens_seen": 7630640, "step": 22665 }, { "epoch": 17.5193199381762, "grad_norm": 1.7284005880355835, "learning_rate": 2.3082227966690297e-06, "loss": 0.5442, "num_input_tokens_seen": 7632368, "step": 22670 }, { "epoch": 17.523183925811438, "grad_norm": 0.8788689374923706, "learning_rate": 2.3011521873153364e-06, "loss": 0.649, "num_input_tokens_seen": 7634192, "step": 22675 }, { "epoch": 17.527047913446676, "grad_norm": 1.2793320417404175, "learning_rate": 2.29409190153356e-06, "loss": 0.5443, "num_input_tokens_seen": 7635952, "step": 22680 }, { "epoch": 17.530911901081918, "grad_norm": 0.8544217944145203, "learning_rate": 2.287041942534773e-06, "loss": 0.3572, "num_input_tokens_seen": 7637776, "step": 22685 }, { "epoch": 17.534775888717157, "grad_norm": 0.7625585198402405, "learning_rate": 2.280002313525334e-06, "loss": 0.4454, "num_input_tokens_seen": 7639440, "step": 22690 }, { "epoch": 17.538639876352395, "grad_norm": 1.3760685920715332, "learning_rate": 2.2729730177069086e-06, "loss": 0.4485, "num_input_tokens_seen": 7641488, "step": 22695 }, { "epoch": 17.542503863987637, "grad_norm": 0.7142928242683411, "learning_rate": 2.2659540582764593e-06, "loss": 0.3376, "num_input_tokens_seen": 7642896, "step": 22700 }, { "epoch": 17.546367851622875, "grad_norm": 2.2525949478149414, "learning_rate": 2.2589454384262494e-06, "loss": 0.5561, "num_input_tokens_seen": 7644592, "step": 22705 }, { "epoch": 17.550231839258114, "grad_norm": 0.839066743850708, "learning_rate": 2.2519471613438482e-06, "loss": 0.3968, "num_input_tokens_seen": 7646192, "step": 22710 }, { "epoch": 17.554095826893352, "grad_norm": 1.0556576251983643, "learning_rate": 2.244959230212107e-06, "loss": 0.3754, "num_input_tokens_seen": 7647728, "step": 22715 }, { "epoch": 17.557959814528594, "grad_norm": 1.292657732963562, "learning_rate": 2.2379816482091866e-06, "loss": 0.4229, "num_input_tokens_seen": 7649520, "step": 22720 }, { "epoch": 17.561823802163833, "grad_norm": 0.9850688576698303, "learning_rate": 2.2310144185085314e-06, "loss": 0.444, "num_input_tokens_seen": 7651088, "step": 22725 }, { "epoch": 17.56568778979907, "grad_norm": 1.0809812545776367, "learning_rate": 2.2240575442788735e-06, "loss": 0.4014, "num_input_tokens_seen": 7652816, "step": 22730 }, { "epoch": 17.569551777434313, "grad_norm": 0.9440416693687439, "learning_rate": 2.217111028684246e-06, "loss": 0.4621, "num_input_tokens_seen": 7654256, "step": 22735 }, { "epoch": 17.57341576506955, "grad_norm": 1.0354974269866943, "learning_rate": 2.2101748748839633e-06, "loss": 0.4262, "num_input_tokens_seen": 7656048, "step": 22740 }, { "epoch": 17.57727975270479, "grad_norm": 0.8927879333496094, "learning_rate": 2.203249086032627e-06, "loss": 0.3658, "num_input_tokens_seen": 7657616, "step": 22745 }, { "epoch": 17.581143740340032, "grad_norm": 1.721167802810669, "learning_rate": 2.196333665280134e-06, "loss": 0.4683, "num_input_tokens_seen": 7659216, "step": 22750 }, { "epoch": 17.58500772797527, "grad_norm": 1.3876813650131226, "learning_rate": 2.189428615771652e-06, "loss": 0.7458, "num_input_tokens_seen": 7660912, "step": 22755 }, { "epoch": 17.58887171561051, "grad_norm": 0.8067340850830078, "learning_rate": 2.182533940647649e-06, "loss": 0.4813, "num_input_tokens_seen": 7662672, "step": 22760 }, { "epoch": 17.59273570324575, "grad_norm": 0.742865800857544, "learning_rate": 2.175649643043856e-06, "loss": 0.4189, "num_input_tokens_seen": 7664432, "step": 22765 }, { "epoch": 17.59659969088099, "grad_norm": 1.2508043050765991, "learning_rate": 2.168775726091296e-06, "loss": 0.3791, "num_input_tokens_seen": 7665968, "step": 22770 }, { "epoch": 17.600463678516228, "grad_norm": 1.3137646913528442, "learning_rate": 2.1619121929162654e-06, "loss": 0.4141, "num_input_tokens_seen": 7667600, "step": 22775 }, { "epoch": 17.60432766615147, "grad_norm": 0.9070088863372803, "learning_rate": 2.155059046640337e-06, "loss": 0.5296, "num_input_tokens_seen": 7669200, "step": 22780 }, { "epoch": 17.608191653786708, "grad_norm": 0.5502965450286865, "learning_rate": 2.1482162903803725e-06, "loss": 0.4866, "num_input_tokens_seen": 7670864, "step": 22785 }, { "epoch": 17.612055641421946, "grad_norm": 1.0247750282287598, "learning_rate": 2.1413839272484887e-06, "loss": 0.4232, "num_input_tokens_seen": 7672432, "step": 22790 }, { "epoch": 17.61591962905719, "grad_norm": 0.8295356631278992, "learning_rate": 2.1345619603520937e-06, "loss": 0.4946, "num_input_tokens_seen": 7674000, "step": 22795 }, { "epoch": 17.619783616692427, "grad_norm": 1.2724748849868774, "learning_rate": 2.1277503927938535e-06, "loss": 0.3746, "num_input_tokens_seen": 7675600, "step": 22800 }, { "epoch": 17.623647604327665, "grad_norm": 1.313180923461914, "learning_rate": 2.1209492276717148e-06, "loss": 0.401, "num_input_tokens_seen": 7677200, "step": 22805 }, { "epoch": 17.627511591962907, "grad_norm": 0.7435005307197571, "learning_rate": 2.1141584680788805e-06, "loss": 0.3798, "num_input_tokens_seen": 7678896, "step": 22810 }, { "epoch": 17.631375579598146, "grad_norm": 1.3200117349624634, "learning_rate": 2.107378117103831e-06, "loss": 0.4431, "num_input_tokens_seen": 7680560, "step": 22815 }, { "epoch": 17.635239567233384, "grad_norm": 2.261898994445801, "learning_rate": 2.1006081778303157e-06, "loss": 0.451, "num_input_tokens_seen": 7682224, "step": 22820 }, { "epoch": 17.639103554868626, "grad_norm": 0.9147229790687561, "learning_rate": 2.093848653337335e-06, "loss": 0.5001, "num_input_tokens_seen": 7683824, "step": 22825 }, { "epoch": 17.642967542503865, "grad_norm": 0.6503116488456726, "learning_rate": 2.087099546699173e-06, "loss": 0.3862, "num_input_tokens_seen": 7685392, "step": 22830 }, { "epoch": 17.646831530139103, "grad_norm": 1.225074291229248, "learning_rate": 2.080360860985356e-06, "loss": 0.4068, "num_input_tokens_seen": 7687120, "step": 22835 }, { "epoch": 17.65069551777434, "grad_norm": 1.2681758403778076, "learning_rate": 2.0736325992606804e-06, "loss": 0.4131, "num_input_tokens_seen": 7688848, "step": 22840 }, { "epoch": 17.654559505409583, "grad_norm": 0.696863055229187, "learning_rate": 2.0669147645851984e-06, "loss": 0.3446, "num_input_tokens_seen": 7690960, "step": 22845 }, { "epoch": 17.658423493044822, "grad_norm": 0.6745784878730774, "learning_rate": 2.060207360014224e-06, "loss": 0.3595, "num_input_tokens_seen": 7692624, "step": 22850 }, { "epoch": 17.66228748068006, "grad_norm": 1.3801931142807007, "learning_rate": 2.0535103885983177e-06, "loss": 0.38, "num_input_tokens_seen": 7694352, "step": 22855 }, { "epoch": 17.666151468315302, "grad_norm": 0.7080813646316528, "learning_rate": 2.0468238533833117e-06, "loss": 0.417, "num_input_tokens_seen": 7696112, "step": 22860 }, { "epoch": 17.67001545595054, "grad_norm": 0.8550204634666443, "learning_rate": 2.040147757410274e-06, "loss": 0.3802, "num_input_tokens_seen": 7697808, "step": 22865 }, { "epoch": 17.67387944358578, "grad_norm": 0.7938693165779114, "learning_rate": 2.033482103715542e-06, "loss": 0.4325, "num_input_tokens_seen": 7699440, "step": 22870 }, { "epoch": 17.67774343122102, "grad_norm": 0.9341617226600647, "learning_rate": 2.026826895330691e-06, "loss": 0.426, "num_input_tokens_seen": 7701200, "step": 22875 }, { "epoch": 17.68160741885626, "grad_norm": 0.9081515669822693, "learning_rate": 2.020182135282547e-06, "loss": 0.4547, "num_input_tokens_seen": 7702992, "step": 22880 }, { "epoch": 17.685471406491498, "grad_norm": 0.6913585066795349, "learning_rate": 2.0135478265931902e-06, "loss": 0.5077, "num_input_tokens_seen": 7704720, "step": 22885 }, { "epoch": 17.68933539412674, "grad_norm": 1.2401530742645264, "learning_rate": 2.0069239722799392e-06, "loss": 0.3798, "num_input_tokens_seen": 7706480, "step": 22890 }, { "epoch": 17.69319938176198, "grad_norm": 0.9119266271591187, "learning_rate": 2.0003105753553685e-06, "loss": 0.5939, "num_input_tokens_seen": 7707984, "step": 22895 }, { "epoch": 17.697063369397217, "grad_norm": 1.0680357217788696, "learning_rate": 1.9937076388272857e-06, "loss": 0.3928, "num_input_tokens_seen": 7709840, "step": 22900 }, { "epoch": 17.70092735703246, "grad_norm": 1.1313154697418213, "learning_rate": 1.987115165698747e-06, "loss": 0.5266, "num_input_tokens_seen": 7711536, "step": 22905 }, { "epoch": 17.704791344667697, "grad_norm": 1.5778621435165405, "learning_rate": 1.9805331589680538e-06, "loss": 0.493, "num_input_tokens_seen": 7713232, "step": 22910 }, { "epoch": 17.708655332302936, "grad_norm": 0.9246322512626648, "learning_rate": 1.9739616216287365e-06, "loss": 0.4217, "num_input_tokens_seen": 7714608, "step": 22915 }, { "epoch": 17.712519319938178, "grad_norm": 0.9394572973251343, "learning_rate": 1.9674005566695714e-06, "loss": 0.361, "num_input_tokens_seen": 7716304, "step": 22920 }, { "epoch": 17.716383307573416, "grad_norm": 1.3034591674804688, "learning_rate": 1.9608499670745686e-06, "loss": 0.5374, "num_input_tokens_seen": 7718032, "step": 22925 }, { "epoch": 17.720247295208654, "grad_norm": 0.8357017040252686, "learning_rate": 1.9543098558229776e-06, "loss": 0.5351, "num_input_tokens_seen": 7719760, "step": 22930 }, { "epoch": 17.724111282843896, "grad_norm": 0.9201805591583252, "learning_rate": 1.9477802258892812e-06, "loss": 0.5508, "num_input_tokens_seen": 7721584, "step": 22935 }, { "epoch": 17.727975270479135, "grad_norm": 0.7489703297615051, "learning_rate": 1.9412610802431923e-06, "loss": 0.5478, "num_input_tokens_seen": 7723216, "step": 22940 }, { "epoch": 17.731839258114373, "grad_norm": 1.0206284523010254, "learning_rate": 1.9347524218496505e-06, "loss": 0.4874, "num_input_tokens_seen": 7724976, "step": 22945 }, { "epoch": 17.735703245749615, "grad_norm": 0.7933930158615112, "learning_rate": 1.928254253668846e-06, "loss": 0.3232, "num_input_tokens_seen": 7726800, "step": 22950 }, { "epoch": 17.739567233384854, "grad_norm": 1.3318523168563843, "learning_rate": 1.9217665786561783e-06, "loss": 0.7358, "num_input_tokens_seen": 7728432, "step": 22955 }, { "epoch": 17.743431221020092, "grad_norm": 1.077871561050415, "learning_rate": 1.9152893997622766e-06, "loss": 0.4295, "num_input_tokens_seen": 7730128, "step": 22960 }, { "epoch": 17.74729520865533, "grad_norm": 0.9427162408828735, "learning_rate": 1.9088227199330095e-06, "loss": 0.3679, "num_input_tokens_seen": 7731664, "step": 22965 }, { "epoch": 17.751159196290573, "grad_norm": 2.072716236114502, "learning_rate": 1.9023665421094572e-06, "loss": 0.6547, "num_input_tokens_seen": 7733072, "step": 22970 }, { "epoch": 17.75502318392581, "grad_norm": 1.328851580619812, "learning_rate": 1.8959208692279267e-06, "loss": 0.338, "num_input_tokens_seen": 7734800, "step": 22975 }, { "epoch": 17.75888717156105, "grad_norm": 0.8709526062011719, "learning_rate": 1.889485704219951e-06, "loss": 0.5248, "num_input_tokens_seen": 7736816, "step": 22980 }, { "epoch": 17.76275115919629, "grad_norm": 1.0483015775680542, "learning_rate": 1.8830610500122748e-06, "loss": 0.2928, "num_input_tokens_seen": 7738608, "step": 22985 }, { "epoch": 17.76661514683153, "grad_norm": 1.9795303344726562, "learning_rate": 1.87664690952688e-06, "loss": 0.3791, "num_input_tokens_seen": 7740400, "step": 22990 }, { "epoch": 17.77047913446677, "grad_norm": 0.7707388997077942, "learning_rate": 1.870243285680945e-06, "loss": 0.6754, "num_input_tokens_seen": 7742160, "step": 22995 }, { "epoch": 17.77434312210201, "grad_norm": 1.153915524482727, "learning_rate": 1.8638501813868892e-06, "loss": 0.4562, "num_input_tokens_seen": 7743792, "step": 23000 }, { "epoch": 17.77820710973725, "grad_norm": 1.0190480947494507, "learning_rate": 1.8574675995523261e-06, "loss": 0.3423, "num_input_tokens_seen": 7745392, "step": 23005 }, { "epoch": 17.782071097372487, "grad_norm": 0.8014883995056152, "learning_rate": 1.8510955430800948e-06, "loss": 0.3122, "num_input_tokens_seen": 7746896, "step": 23010 }, { "epoch": 17.78593508500773, "grad_norm": 1.0162169933319092, "learning_rate": 1.8447340148682435e-06, "loss": 0.4333, "num_input_tokens_seen": 7748624, "step": 23015 }, { "epoch": 17.789799072642968, "grad_norm": 1.1231915950775146, "learning_rate": 1.8383830178100358e-06, "loss": 0.4915, "num_input_tokens_seen": 7750288, "step": 23020 }, { "epoch": 17.793663060278206, "grad_norm": 1.2249834537506104, "learning_rate": 1.8320425547939335e-06, "loss": 0.3612, "num_input_tokens_seen": 7751920, "step": 23025 }, { "epoch": 17.797527047913448, "grad_norm": 0.6158238053321838, "learning_rate": 1.8257126287036269e-06, "loss": 0.5104, "num_input_tokens_seen": 7753488, "step": 23030 }, { "epoch": 17.801391035548686, "grad_norm": 1.5878280401229858, "learning_rate": 1.819393242418005e-06, "loss": 0.4408, "num_input_tokens_seen": 7755312, "step": 23035 }, { "epoch": 17.805255023183925, "grad_norm": 0.9027120471000671, "learning_rate": 1.81308439881116e-06, "loss": 0.4688, "num_input_tokens_seen": 7756816, "step": 23040 }, { "epoch": 17.809119010819167, "grad_norm": 1.216827392578125, "learning_rate": 1.8067861007523918e-06, "loss": 0.4137, "num_input_tokens_seen": 7758704, "step": 23045 }, { "epoch": 17.812982998454405, "grad_norm": 2.3739922046661377, "learning_rate": 1.8004983511062057e-06, "loss": 0.5043, "num_input_tokens_seen": 7760240, "step": 23050 }, { "epoch": 17.816846986089644, "grad_norm": 0.9763891100883484, "learning_rate": 1.7942211527323034e-06, "loss": 0.5662, "num_input_tokens_seen": 7761712, "step": 23055 }, { "epoch": 17.820710973724886, "grad_norm": 1.221765398979187, "learning_rate": 1.7879545084855898e-06, "loss": 0.6455, "num_input_tokens_seen": 7763536, "step": 23060 }, { "epoch": 17.824574961360124, "grad_norm": 0.8515592217445374, "learning_rate": 1.7816984212161797e-06, "loss": 0.4205, "num_input_tokens_seen": 7765552, "step": 23065 }, { "epoch": 17.828438948995363, "grad_norm": 0.9260168671607971, "learning_rate": 1.7754528937693777e-06, "loss": 0.4418, "num_input_tokens_seen": 7767344, "step": 23070 }, { "epoch": 17.832302936630605, "grad_norm": 0.7259600162506104, "learning_rate": 1.7692179289856892e-06, "loss": 0.7168, "num_input_tokens_seen": 7768912, "step": 23075 }, { "epoch": 17.836166924265843, "grad_norm": 1.579936146736145, "learning_rate": 1.7629935297008071e-06, "loss": 0.4233, "num_input_tokens_seen": 7770512, "step": 23080 }, { "epoch": 17.84003091190108, "grad_norm": 0.9256331324577332, "learning_rate": 1.756779698745631e-06, "loss": 0.3817, "num_input_tokens_seen": 7772112, "step": 23085 }, { "epoch": 17.84389489953632, "grad_norm": 0.6817066669464111, "learning_rate": 1.750576438946247e-06, "loss": 0.3809, "num_input_tokens_seen": 7773648, "step": 23090 }, { "epoch": 17.847758887171562, "grad_norm": 0.7578368782997131, "learning_rate": 1.7443837531239264e-06, "loss": 0.506, "num_input_tokens_seen": 7775248, "step": 23095 }, { "epoch": 17.8516228748068, "grad_norm": 0.8542349934577942, "learning_rate": 1.7382016440951554e-06, "loss": 0.6244, "num_input_tokens_seen": 7776944, "step": 23100 }, { "epoch": 17.85548686244204, "grad_norm": 1.041899561882019, "learning_rate": 1.732030114671579e-06, "loss": 0.3896, "num_input_tokens_seen": 7778672, "step": 23105 }, { "epoch": 17.85935085007728, "grad_norm": 1.0322024822235107, "learning_rate": 1.7258691676600575e-06, "loss": 0.4808, "num_input_tokens_seen": 7780336, "step": 23110 }, { "epoch": 17.86321483771252, "grad_norm": 1.5172780752182007, "learning_rate": 1.7197188058626217e-06, "loss": 0.51, "num_input_tokens_seen": 7781840, "step": 23115 }, { "epoch": 17.867078825347757, "grad_norm": 1.462204933166504, "learning_rate": 1.7135790320764955e-06, "loss": 0.3525, "num_input_tokens_seen": 7783440, "step": 23120 }, { "epoch": 17.870942812983, "grad_norm": 0.9154068231582642, "learning_rate": 1.707449849094081e-06, "loss": 0.4145, "num_input_tokens_seen": 7785232, "step": 23125 }, { "epoch": 17.874806800618238, "grad_norm": 1.1046549081802368, "learning_rate": 1.7013312597029623e-06, "loss": 0.3501, "num_input_tokens_seen": 7786608, "step": 23130 }, { "epoch": 17.878670788253476, "grad_norm": 1.2630342245101929, "learning_rate": 1.6952232666859247e-06, "loss": 0.4093, "num_input_tokens_seen": 7788304, "step": 23135 }, { "epoch": 17.88253477588872, "grad_norm": 1.5321561098098755, "learning_rate": 1.6891258728209097e-06, "loss": 0.4688, "num_input_tokens_seen": 7789936, "step": 23140 }, { "epoch": 17.886398763523957, "grad_norm": 0.9917981624603271, "learning_rate": 1.6830390808810465e-06, "loss": 0.4396, "num_input_tokens_seen": 7791600, "step": 23145 }, { "epoch": 17.890262751159195, "grad_norm": 1.1672194004058838, "learning_rate": 1.6769628936346566e-06, "loss": 0.4388, "num_input_tokens_seen": 7793008, "step": 23150 }, { "epoch": 17.894126738794437, "grad_norm": 0.9105015993118286, "learning_rate": 1.6708973138452155e-06, "loss": 0.3551, "num_input_tokens_seen": 7794736, "step": 23155 }, { "epoch": 17.897990726429676, "grad_norm": 1.8573616743087769, "learning_rate": 1.664842344271389e-06, "loss": 0.5652, "num_input_tokens_seen": 7796240, "step": 23160 }, { "epoch": 17.901854714064914, "grad_norm": 1.731489896774292, "learning_rate": 1.6587979876670102e-06, "loss": 0.572, "num_input_tokens_seen": 7798032, "step": 23165 }, { "epoch": 17.905718701700156, "grad_norm": 0.9431066513061523, "learning_rate": 1.6527642467810967e-06, "loss": 0.3836, "num_input_tokens_seen": 7799536, "step": 23170 }, { "epoch": 17.909582689335394, "grad_norm": 1.435091495513916, "learning_rate": 1.6467411243578229e-06, "loss": 0.3725, "num_input_tokens_seen": 7801328, "step": 23175 }, { "epoch": 17.913446676970633, "grad_norm": 1.1272635459899902, "learning_rate": 1.6407286231365449e-06, "loss": 0.5404, "num_input_tokens_seen": 7802992, "step": 23180 }, { "epoch": 17.917310664605875, "grad_norm": 0.9461513161659241, "learning_rate": 1.6347267458517752e-06, "loss": 0.4064, "num_input_tokens_seen": 7804784, "step": 23185 }, { "epoch": 17.921174652241113, "grad_norm": 0.886171281337738, "learning_rate": 1.6287354952332162e-06, "loss": 0.3525, "num_input_tokens_seen": 7806512, "step": 23190 }, { "epoch": 17.92503863987635, "grad_norm": 1.7028144598007202, "learning_rate": 1.6227548740057191e-06, "loss": 0.3981, "num_input_tokens_seen": 7808176, "step": 23195 }, { "epoch": 17.92890262751159, "grad_norm": 0.7722470760345459, "learning_rate": 1.6167848848893024e-06, "loss": 0.4272, "num_input_tokens_seen": 7809712, "step": 23200 }, { "epoch": 17.932766615146832, "grad_norm": 0.8832216858863831, "learning_rate": 1.610825530599161e-06, "loss": 0.4092, "num_input_tokens_seen": 7811376, "step": 23205 }, { "epoch": 17.93663060278207, "grad_norm": 0.9507635831832886, "learning_rate": 1.6048768138456406e-06, "loss": 0.3803, "num_input_tokens_seen": 7813040, "step": 23210 }, { "epoch": 17.94049459041731, "grad_norm": 0.9221833348274231, "learning_rate": 1.5989387373342518e-06, "loss": 0.3764, "num_input_tokens_seen": 7814480, "step": 23215 }, { "epoch": 17.94435857805255, "grad_norm": 0.8478395938873291, "learning_rate": 1.593011303765668e-06, "loss": 0.3211, "num_input_tokens_seen": 7816080, "step": 23220 }, { "epoch": 17.94822256568779, "grad_norm": 1.1320866346359253, "learning_rate": 1.5870945158357214e-06, "loss": 0.3373, "num_input_tokens_seen": 7817680, "step": 23225 }, { "epoch": 17.952086553323028, "grad_norm": 1.1034681797027588, "learning_rate": 1.581188376235404e-06, "loss": 0.3888, "num_input_tokens_seen": 7819216, "step": 23230 }, { "epoch": 17.95595054095827, "grad_norm": 0.7122790813446045, "learning_rate": 1.5752928876508615e-06, "loss": 0.3293, "num_input_tokens_seen": 7821136, "step": 23235 }, { "epoch": 17.95981452859351, "grad_norm": 1.1801555156707764, "learning_rate": 1.569408052763402e-06, "loss": 0.3475, "num_input_tokens_seen": 7822960, "step": 23240 }, { "epoch": 17.963678516228747, "grad_norm": 0.8510901927947998, "learning_rate": 1.5635338742494787e-06, "loss": 0.4195, "num_input_tokens_seen": 7824784, "step": 23245 }, { "epoch": 17.96754250386399, "grad_norm": 1.9706987142562866, "learning_rate": 1.5576703547807075e-06, "loss": 0.3836, "num_input_tokens_seen": 7826352, "step": 23250 }, { "epoch": 17.971406491499227, "grad_norm": 0.7297654151916504, "learning_rate": 1.5518174970238496e-06, "loss": 0.61, "num_input_tokens_seen": 7828464, "step": 23255 }, { "epoch": 17.975270479134466, "grad_norm": 0.7560098767280579, "learning_rate": 1.5459753036408175e-06, "loss": 0.3257, "num_input_tokens_seen": 7830128, "step": 23260 }, { "epoch": 17.979134466769708, "grad_norm": 1.356762409210205, "learning_rate": 1.5401437772886745e-06, "loss": 0.454, "num_input_tokens_seen": 7831920, "step": 23265 }, { "epoch": 17.982998454404946, "grad_norm": 1.9123632907867432, "learning_rate": 1.534322920619638e-06, "loss": 0.4266, "num_input_tokens_seen": 7833456, "step": 23270 }, { "epoch": 17.986862442040184, "grad_norm": 1.7537733316421509, "learning_rate": 1.5285127362810708e-06, "loss": 0.5443, "num_input_tokens_seen": 7835024, "step": 23275 }, { "epoch": 17.990726429675426, "grad_norm": 1.4739254713058472, "learning_rate": 1.5227132269154787e-06, "loss": 0.4538, "num_input_tokens_seen": 7836624, "step": 23280 }, { "epoch": 17.994590417310665, "grad_norm": 1.1776708364486694, "learning_rate": 1.5169243951605071e-06, "loss": 0.4711, "num_input_tokens_seen": 7838448, "step": 23285 }, { "epoch": 17.998454404945903, "grad_norm": 1.4411441087722778, "learning_rate": 1.5111462436489587e-06, "loss": 0.5031, "num_input_tokens_seen": 7840368, "step": 23290 }, { "epoch": 18.0, "eval_loss": 0.461969256401062, "eval_runtime": 6.3685, "eval_samples_per_second": 90.288, "eval_steps_per_second": 22.611, "num_input_tokens_seen": 7840784, "step": 23292 }, { "epoch": 18.002318392581145, "grad_norm": 0.8297837972640991, "learning_rate": 1.5053787750087645e-06, "loss": 0.7468, "num_input_tokens_seen": 7841840, "step": 23295 }, { "epoch": 18.006182380216384, "grad_norm": 0.6900520920753479, "learning_rate": 1.4996219918630068e-06, "loss": 0.392, "num_input_tokens_seen": 7843472, "step": 23300 }, { "epoch": 18.010046367851622, "grad_norm": 0.7359381914138794, "learning_rate": 1.4938758968299022e-06, "loss": 0.3365, "num_input_tokens_seen": 7845072, "step": 23305 }, { "epoch": 18.013910355486864, "grad_norm": 1.2789736986160278, "learning_rate": 1.4881404925228187e-06, "loss": 0.4243, "num_input_tokens_seen": 7846864, "step": 23310 }, { "epoch": 18.017774343122102, "grad_norm": 1.1875879764556885, "learning_rate": 1.4824157815502448e-06, "loss": 0.4049, "num_input_tokens_seen": 7848336, "step": 23315 }, { "epoch": 18.02163833075734, "grad_norm": 1.5029408931732178, "learning_rate": 1.4767017665158145e-06, "loss": 0.49, "num_input_tokens_seen": 7849968, "step": 23320 }, { "epoch": 18.025502318392583, "grad_norm": 0.721777617931366, "learning_rate": 1.4709984500182987e-06, "loss": 0.6296, "num_input_tokens_seen": 7851632, "step": 23325 }, { "epoch": 18.02936630602782, "grad_norm": 1.0221766233444214, "learning_rate": 1.4653058346515953e-06, "loss": 0.4156, "num_input_tokens_seen": 7853296, "step": 23330 }, { "epoch": 18.03323029366306, "grad_norm": 0.7679873108863831, "learning_rate": 1.4596239230047381e-06, "loss": 0.4729, "num_input_tokens_seen": 7855184, "step": 23335 }, { "epoch": 18.037094281298298, "grad_norm": 0.74310702085495, "learning_rate": 1.453952717661905e-06, "loss": 0.3665, "num_input_tokens_seen": 7856688, "step": 23340 }, { "epoch": 18.04095826893354, "grad_norm": 0.8607534766197205, "learning_rate": 1.4482922212023797e-06, "loss": 0.4385, "num_input_tokens_seen": 7858256, "step": 23345 }, { "epoch": 18.04482225656878, "grad_norm": 0.6136468648910522, "learning_rate": 1.4426424362006057e-06, "loss": 0.4242, "num_input_tokens_seen": 7860080, "step": 23350 }, { "epoch": 18.048686244204017, "grad_norm": 0.9425386190414429, "learning_rate": 1.4370033652261277e-06, "loss": 0.4342, "num_input_tokens_seen": 7861936, "step": 23355 }, { "epoch": 18.05255023183926, "grad_norm": 1.056610345840454, "learning_rate": 1.4313750108436359e-06, "loss": 0.4913, "num_input_tokens_seen": 7863568, "step": 23360 }, { "epoch": 18.056414219474497, "grad_norm": 1.266706109046936, "learning_rate": 1.4257573756129321e-06, "loss": 0.496, "num_input_tokens_seen": 7865360, "step": 23365 }, { "epoch": 18.060278207109736, "grad_norm": 1.001253604888916, "learning_rate": 1.4201504620889538e-06, "loss": 0.3912, "num_input_tokens_seen": 7867088, "step": 23370 }, { "epoch": 18.064142194744978, "grad_norm": 1.264996886253357, "learning_rate": 1.4145542728217637e-06, "loss": 0.4181, "num_input_tokens_seen": 7868816, "step": 23375 }, { "epoch": 18.068006182380216, "grad_norm": 0.762334942817688, "learning_rate": 1.4089688103565368e-06, "loss": 0.41, "num_input_tokens_seen": 7870288, "step": 23380 }, { "epoch": 18.071870170015455, "grad_norm": 1.2239676713943481, "learning_rate": 1.4033940772335719e-06, "loss": 0.4085, "num_input_tokens_seen": 7871920, "step": 23385 }, { "epoch": 18.075734157650697, "grad_norm": 1.1536519527435303, "learning_rate": 1.397830075988299e-06, "loss": 0.5129, "num_input_tokens_seen": 7873872, "step": 23390 }, { "epoch": 18.079598145285935, "grad_norm": 1.0342321395874023, "learning_rate": 1.392276809151255e-06, "loss": 0.3919, "num_input_tokens_seen": 7875856, "step": 23395 }, { "epoch": 18.083462132921174, "grad_norm": 0.9676990509033203, "learning_rate": 1.3867342792481003e-06, "loss": 0.3637, "num_input_tokens_seen": 7877552, "step": 23400 }, { "epoch": 18.087326120556416, "grad_norm": 0.731855034828186, "learning_rate": 1.3812024887996045e-06, "loss": 0.3061, "num_input_tokens_seen": 7879216, "step": 23405 }, { "epoch": 18.091190108191654, "grad_norm": 1.0883334875106812, "learning_rate": 1.3756814403216688e-06, "loss": 0.3575, "num_input_tokens_seen": 7880720, "step": 23410 }, { "epoch": 18.095054095826892, "grad_norm": 0.8783860802650452, "learning_rate": 1.3701711363252962e-06, "loss": 0.371, "num_input_tokens_seen": 7882640, "step": 23415 }, { "epoch": 18.098918083462134, "grad_norm": 0.7250942587852478, "learning_rate": 1.3646715793166037e-06, "loss": 0.3436, "num_input_tokens_seen": 7884176, "step": 23420 }, { "epoch": 18.102782071097373, "grad_norm": 1.4472906589508057, "learning_rate": 1.3591827717968186e-06, "loss": 0.4567, "num_input_tokens_seen": 7885808, "step": 23425 }, { "epoch": 18.10664605873261, "grad_norm": 0.874907910823822, "learning_rate": 1.3537047162622912e-06, "loss": 0.4426, "num_input_tokens_seen": 7887376, "step": 23430 }, { "epoch": 18.110510046367853, "grad_norm": 0.7464891076087952, "learning_rate": 1.348237415204473e-06, "loss": 0.3672, "num_input_tokens_seen": 7889360, "step": 23435 }, { "epoch": 18.11437403400309, "grad_norm": 1.1311688423156738, "learning_rate": 1.3427808711099165e-06, "loss": 0.4751, "num_input_tokens_seen": 7891120, "step": 23440 }, { "epoch": 18.11823802163833, "grad_norm": 0.8809656500816345, "learning_rate": 1.3373350864603034e-06, "loss": 0.4752, "num_input_tokens_seen": 7892880, "step": 23445 }, { "epoch": 18.122102009273572, "grad_norm": 0.9410935044288635, "learning_rate": 1.3319000637324025e-06, "loss": 0.598, "num_input_tokens_seen": 7894544, "step": 23450 }, { "epoch": 18.12596599690881, "grad_norm": 0.9320741891860962, "learning_rate": 1.3264758053980974e-06, "loss": 0.386, "num_input_tokens_seen": 7896336, "step": 23455 }, { "epoch": 18.12982998454405, "grad_norm": 1.4838504791259766, "learning_rate": 1.3210623139243678e-06, "loss": 0.4344, "num_input_tokens_seen": 7897904, "step": 23460 }, { "epoch": 18.133693972179287, "grad_norm": 0.7006383538246155, "learning_rate": 1.315659591773305e-06, "loss": 0.391, "num_input_tokens_seen": 7899376, "step": 23465 }, { "epoch": 18.13755795981453, "grad_norm": 1.6355637311935425, "learning_rate": 1.3102676414020993e-06, "loss": 0.6587, "num_input_tokens_seen": 7900912, "step": 23470 }, { "epoch": 18.141421947449768, "grad_norm": 0.8786815404891968, "learning_rate": 1.304886465263047e-06, "loss": 0.4193, "num_input_tokens_seen": 7902512, "step": 23475 }, { "epoch": 18.145285935085006, "grad_norm": 0.9589652419090271, "learning_rate": 1.2995160658035272e-06, "loss": 0.3724, "num_input_tokens_seen": 7904272, "step": 23480 }, { "epoch": 18.149149922720248, "grad_norm": 1.4912846088409424, "learning_rate": 1.2941564454660438e-06, "loss": 0.4092, "num_input_tokens_seen": 7906224, "step": 23485 }, { "epoch": 18.153013910355487, "grad_norm": 1.108595609664917, "learning_rate": 1.2888076066881778e-06, "loss": 0.4249, "num_input_tokens_seen": 7907824, "step": 23490 }, { "epoch": 18.156877897990725, "grad_norm": 1.2268980741500854, "learning_rate": 1.2834695519026109e-06, "loss": 0.3946, "num_input_tokens_seen": 7909360, "step": 23495 }, { "epoch": 18.160741885625967, "grad_norm": 0.8433042764663696, "learning_rate": 1.2781422835371259e-06, "loss": 0.3899, "num_input_tokens_seen": 7910992, "step": 23500 }, { "epoch": 18.164605873261205, "grad_norm": 1.1534924507141113, "learning_rate": 1.2728258040145907e-06, "loss": 0.6232, "num_input_tokens_seen": 7912880, "step": 23505 }, { "epoch": 18.168469860896444, "grad_norm": 1.0380523204803467, "learning_rate": 1.2675201157529792e-06, "loss": 0.3682, "num_input_tokens_seen": 7914768, "step": 23510 }, { "epoch": 18.172333848531686, "grad_norm": 0.8218289017677307, "learning_rate": 1.2622252211653473e-06, "loss": 0.3887, "num_input_tokens_seen": 7916304, "step": 23515 }, { "epoch": 18.176197836166924, "grad_norm": 0.7408699989318848, "learning_rate": 1.2569411226598438e-06, "loss": 0.3501, "num_input_tokens_seen": 7917968, "step": 23520 }, { "epoch": 18.180061823802163, "grad_norm": 1.0394047498703003, "learning_rate": 1.2516678226397127e-06, "loss": 0.4179, "num_input_tokens_seen": 7919600, "step": 23525 }, { "epoch": 18.183925811437405, "grad_norm": 1.1519496440887451, "learning_rate": 1.2464053235032775e-06, "loss": 0.4957, "num_input_tokens_seen": 7921424, "step": 23530 }, { "epoch": 18.187789799072643, "grad_norm": 1.5464667081832886, "learning_rate": 1.2411536276439567e-06, "loss": 0.3691, "num_input_tokens_seen": 7923120, "step": 23535 }, { "epoch": 18.19165378670788, "grad_norm": 1.7659778594970703, "learning_rate": 1.2359127374502482e-06, "loss": 0.5144, "num_input_tokens_seen": 7924464, "step": 23540 }, { "epoch": 18.195517774343124, "grad_norm": 1.1863738298416138, "learning_rate": 1.2306826553057454e-06, "loss": 0.4135, "num_input_tokens_seen": 7926320, "step": 23545 }, { "epoch": 18.199381761978362, "grad_norm": 0.8843436241149902, "learning_rate": 1.2254633835891205e-06, "loss": 0.4189, "num_input_tokens_seen": 7928240, "step": 23550 }, { "epoch": 18.2032457496136, "grad_norm": 2.686959743499756, "learning_rate": 1.2202549246741302e-06, "loss": 0.4538, "num_input_tokens_seen": 7929968, "step": 23555 }, { "epoch": 18.207109737248842, "grad_norm": 0.8605945110321045, "learning_rate": 1.21505728092961e-06, "loss": 0.4467, "num_input_tokens_seen": 7931664, "step": 23560 }, { "epoch": 18.21097372488408, "grad_norm": 0.6327583193778992, "learning_rate": 1.2098704547194834e-06, "loss": 0.4128, "num_input_tokens_seen": 7933328, "step": 23565 }, { "epoch": 18.21483771251932, "grad_norm": 0.6115022301673889, "learning_rate": 1.2046944484027462e-06, "loss": 0.3291, "num_input_tokens_seen": 7934928, "step": 23570 }, { "epoch": 18.21870170015456, "grad_norm": 1.115966558456421, "learning_rate": 1.1995292643334794e-06, "loss": 0.6315, "num_input_tokens_seen": 7936496, "step": 23575 }, { "epoch": 18.2225656877898, "grad_norm": 0.8502809405326843, "learning_rate": 1.1943749048608343e-06, "loss": 0.4733, "num_input_tokens_seen": 7938224, "step": 23580 }, { "epoch": 18.226429675425038, "grad_norm": 0.8057404160499573, "learning_rate": 1.18923137232905e-06, "loss": 0.5149, "num_input_tokens_seen": 7939792, "step": 23585 }, { "epoch": 18.230293663060277, "grad_norm": 0.7932209968566895, "learning_rate": 1.1840986690774353e-06, "loss": 0.4211, "num_input_tokens_seen": 7941584, "step": 23590 }, { "epoch": 18.23415765069552, "grad_norm": 1.107171893119812, "learning_rate": 1.1789767974403759e-06, "loss": 0.4308, "num_input_tokens_seen": 7943440, "step": 23595 }, { "epoch": 18.238021638330757, "grad_norm": 0.8095762729644775, "learning_rate": 1.173865759747328e-06, "loss": 0.5264, "num_input_tokens_seen": 7945136, "step": 23600 }, { "epoch": 18.241885625965995, "grad_norm": 0.779999852180481, "learning_rate": 1.1687655583228207e-06, "loss": 0.34, "num_input_tokens_seen": 7946832, "step": 23605 }, { "epoch": 18.245749613601237, "grad_norm": 2.207599401473999, "learning_rate": 1.1636761954864573e-06, "loss": 0.4432, "num_input_tokens_seen": 7948688, "step": 23610 }, { "epoch": 18.249613601236476, "grad_norm": 1.3396612405776978, "learning_rate": 1.158597673552908e-06, "loss": 0.5274, "num_input_tokens_seen": 7950384, "step": 23615 }, { "epoch": 18.253477588871714, "grad_norm": 1.3098821640014648, "learning_rate": 1.15352999483192e-06, "loss": 0.3713, "num_input_tokens_seen": 7952016, "step": 23620 }, { "epoch": 18.257341576506956, "grad_norm": 1.5623788833618164, "learning_rate": 1.1484731616282967e-06, "loss": 0.4191, "num_input_tokens_seen": 7953648, "step": 23625 }, { "epoch": 18.261205564142195, "grad_norm": 0.7343312501907349, "learning_rate": 1.1434271762419235e-06, "loss": 0.3704, "num_input_tokens_seen": 7955344, "step": 23630 }, { "epoch": 18.265069551777433, "grad_norm": 1.1389696598052979, "learning_rate": 1.138392040967745e-06, "loss": 0.4404, "num_input_tokens_seen": 7957232, "step": 23635 }, { "epoch": 18.268933539412675, "grad_norm": 0.8001974821090698, "learning_rate": 1.1333677580957657e-06, "loss": 0.3839, "num_input_tokens_seen": 7958960, "step": 23640 }, { "epoch": 18.272797527047913, "grad_norm": 1.4217418432235718, "learning_rate": 1.1283543299110632e-06, "loss": 0.3564, "num_input_tokens_seen": 7960496, "step": 23645 }, { "epoch": 18.276661514683152, "grad_norm": 0.9161412119865417, "learning_rate": 1.1233517586937664e-06, "loss": 0.542, "num_input_tokens_seen": 7962320, "step": 23650 }, { "epoch": 18.280525502318394, "grad_norm": 0.9955741167068481, "learning_rate": 1.1183600467190885e-06, "loss": 0.3437, "num_input_tokens_seen": 7963888, "step": 23655 }, { "epoch": 18.284389489953632, "grad_norm": 1.4832979440689087, "learning_rate": 1.1133791962572805e-06, "loss": 0.3714, "num_input_tokens_seen": 7965744, "step": 23660 }, { "epoch": 18.28825347758887, "grad_norm": 1.2217521667480469, "learning_rate": 1.1084092095736659e-06, "loss": 0.4603, "num_input_tokens_seen": 7967312, "step": 23665 }, { "epoch": 18.292117465224113, "grad_norm": 0.6660898327827454, "learning_rate": 1.103450088928623e-06, "loss": 0.4785, "num_input_tokens_seen": 7968976, "step": 23670 }, { "epoch": 18.29598145285935, "grad_norm": 1.126656413078308, "learning_rate": 1.0985018365775922e-06, "loss": 0.4127, "num_input_tokens_seen": 7970704, "step": 23675 }, { "epoch": 18.29984544049459, "grad_norm": 0.8599227666854858, "learning_rate": 1.093564454771065e-06, "loss": 0.3831, "num_input_tokens_seen": 7972272, "step": 23680 }, { "epoch": 18.30370942812983, "grad_norm": 1.47013258934021, "learning_rate": 1.088637945754592e-06, "loss": 0.3936, "num_input_tokens_seen": 7973968, "step": 23685 }, { "epoch": 18.30757341576507, "grad_norm": 1.1917749643325806, "learning_rate": 1.0837223117687839e-06, "loss": 0.472, "num_input_tokens_seen": 7975504, "step": 23690 }, { "epoch": 18.31143740340031, "grad_norm": 1.0707679986953735, "learning_rate": 1.0788175550492969e-06, "loss": 0.4415, "num_input_tokens_seen": 7976944, "step": 23695 }, { "epoch": 18.315301391035547, "grad_norm": 0.8249748945236206, "learning_rate": 1.0739236778268435e-06, "loss": 0.3658, "num_input_tokens_seen": 7978544, "step": 23700 }, { "epoch": 18.31916537867079, "grad_norm": 0.8224197030067444, "learning_rate": 1.0690406823271909e-06, "loss": 0.3868, "num_input_tokens_seen": 7980336, "step": 23705 }, { "epoch": 18.323029366306027, "grad_norm": 0.9911274909973145, "learning_rate": 1.0641685707711486e-06, "loss": 0.4222, "num_input_tokens_seen": 7982000, "step": 23710 }, { "epoch": 18.326893353941266, "grad_norm": 1.3688745498657227, "learning_rate": 1.059307345374591e-06, "loss": 0.464, "num_input_tokens_seen": 7983728, "step": 23715 }, { "epoch": 18.330757341576508, "grad_norm": 1.2294777631759644, "learning_rate": 1.0544570083484223e-06, "loss": 0.3075, "num_input_tokens_seen": 7985424, "step": 23720 }, { "epoch": 18.334621329211746, "grad_norm": 0.7448731064796448, "learning_rate": 1.0496175618986166e-06, "loss": 0.336, "num_input_tokens_seen": 7986864, "step": 23725 }, { "epoch": 18.338485316846985, "grad_norm": 0.7191780209541321, "learning_rate": 1.0447890082261742e-06, "loss": 0.4551, "num_input_tokens_seen": 7988496, "step": 23730 }, { "epoch": 18.342349304482227, "grad_norm": 0.9793856143951416, "learning_rate": 1.0399713495271551e-06, "loss": 0.6108, "num_input_tokens_seen": 7989936, "step": 23735 }, { "epoch": 18.346213292117465, "grad_norm": 0.8784586191177368, "learning_rate": 1.035164587992657e-06, "loss": 0.3543, "num_input_tokens_seen": 7991504, "step": 23740 }, { "epoch": 18.350077279752703, "grad_norm": 0.9373983144760132, "learning_rate": 1.0303687258088223e-06, "loss": 0.3926, "num_input_tokens_seen": 7993168, "step": 23745 }, { "epoch": 18.353941267387945, "grad_norm": 0.8993312120437622, "learning_rate": 1.0255837651568373e-06, "loss": 0.4588, "num_input_tokens_seen": 7994832, "step": 23750 }, { "epoch": 18.357805255023184, "grad_norm": 1.4119179248809814, "learning_rate": 1.0208097082129332e-06, "loss": 0.6181, "num_input_tokens_seen": 7996432, "step": 23755 }, { "epoch": 18.361669242658422, "grad_norm": 1.2078136205673218, "learning_rate": 1.0160465571483812e-06, "loss": 0.4032, "num_input_tokens_seen": 7998448, "step": 23760 }, { "epoch": 18.365533230293664, "grad_norm": 1.1528302431106567, "learning_rate": 1.0112943141294907e-06, "loss": 0.3773, "num_input_tokens_seen": 8000080, "step": 23765 }, { "epoch": 18.369397217928903, "grad_norm": 0.6918260455131531, "learning_rate": 1.00655298131761e-06, "loss": 0.3955, "num_input_tokens_seen": 8001648, "step": 23770 }, { "epoch": 18.37326120556414, "grad_norm": 1.170006513595581, "learning_rate": 1.001822560869123e-06, "loss": 0.4414, "num_input_tokens_seen": 8002992, "step": 23775 }, { "epoch": 18.377125193199383, "grad_norm": 1.1197551488876343, "learning_rate": 9.971030549354554e-07, "loss": 0.3932, "num_input_tokens_seen": 8004912, "step": 23780 }, { "epoch": 18.38098918083462, "grad_norm": 0.9010127186775208, "learning_rate": 9.923944656630628e-07, "loss": 0.2984, "num_input_tokens_seen": 8006384, "step": 23785 }, { "epoch": 18.38485316846986, "grad_norm": 0.6595208048820496, "learning_rate": 9.876967951934435e-07, "loss": 0.4147, "num_input_tokens_seen": 8008464, "step": 23790 }, { "epoch": 18.388717156105102, "grad_norm": 1.420752763748169, "learning_rate": 9.83010045663127e-07, "loss": 0.4414, "num_input_tokens_seen": 8010224, "step": 23795 }, { "epoch": 18.39258114374034, "grad_norm": 1.534024953842163, "learning_rate": 9.783342192036777e-07, "loss": 0.397, "num_input_tokens_seen": 8011856, "step": 23800 }, { "epoch": 18.39644513137558, "grad_norm": 1.2161656618118286, "learning_rate": 9.736693179416834e-07, "loss": 0.4561, "num_input_tokens_seen": 8013392, "step": 23805 }, { "epoch": 18.40030911901082, "grad_norm": 0.5607199668884277, "learning_rate": 9.690153439987692e-07, "loss": 0.4422, "num_input_tokens_seen": 8014832, "step": 23810 }, { "epoch": 18.40417310664606, "grad_norm": 1.5077766180038452, "learning_rate": 9.643722994915949e-07, "loss": 0.5275, "num_input_tokens_seen": 8016784, "step": 23815 }, { "epoch": 18.408037094281298, "grad_norm": 1.512234091758728, "learning_rate": 9.597401865318405e-07, "loss": 0.4414, "num_input_tokens_seen": 8018480, "step": 23820 }, { "epoch": 18.41190108191654, "grad_norm": 1.0123252868652344, "learning_rate": 9.55119007226221e-07, "loss": 0.4373, "num_input_tokens_seen": 8020272, "step": 23825 }, { "epoch": 18.415765069551778, "grad_norm": 1.5626167058944702, "learning_rate": 9.505087636764748e-07, "loss": 0.4909, "num_input_tokens_seen": 8021808, "step": 23830 }, { "epoch": 18.419629057187016, "grad_norm": 1.5428948402404785, "learning_rate": 9.459094579793715e-07, "loss": 0.4359, "num_input_tokens_seen": 8023504, "step": 23835 }, { "epoch": 18.423493044822255, "grad_norm": 0.8858530521392822, "learning_rate": 9.413210922267019e-07, "loss": 0.3148, "num_input_tokens_seen": 8024944, "step": 23840 }, { "epoch": 18.427357032457497, "grad_norm": 1.0187532901763916, "learning_rate": 9.367436685052828e-07, "loss": 0.4037, "num_input_tokens_seen": 8026640, "step": 23845 }, { "epoch": 18.431221020092735, "grad_norm": 0.6647658944129944, "learning_rate": 9.321771888969488e-07, "loss": 0.647, "num_input_tokens_seen": 8028432, "step": 23850 }, { "epoch": 18.435085007727974, "grad_norm": 0.9879227876663208, "learning_rate": 9.276216554785666e-07, "loss": 0.5457, "num_input_tokens_seen": 8030032, "step": 23855 }, { "epoch": 18.438948995363216, "grad_norm": 1.0451109409332275, "learning_rate": 9.230770703220204e-07, "loss": 0.3516, "num_input_tokens_seen": 8031760, "step": 23860 }, { "epoch": 18.442812982998454, "grad_norm": 0.8901644349098206, "learning_rate": 9.185434354942124e-07, "loss": 0.5845, "num_input_tokens_seen": 8033200, "step": 23865 }, { "epoch": 18.446676970633693, "grad_norm": 1.4765410423278809, "learning_rate": 9.140207530570683e-07, "loss": 0.485, "num_input_tokens_seen": 8035120, "step": 23870 }, { "epoch": 18.450540958268935, "grad_norm": 1.1796610355377197, "learning_rate": 9.095090250675315e-07, "loss": 0.5621, "num_input_tokens_seen": 8036848, "step": 23875 }, { "epoch": 18.454404945904173, "grad_norm": 0.6287869215011597, "learning_rate": 9.050082535775634e-07, "loss": 0.3748, "num_input_tokens_seen": 8038576, "step": 23880 }, { "epoch": 18.45826893353941, "grad_norm": 1.2677565813064575, "learning_rate": 9.005184406341405e-07, "loss": 0.5613, "num_input_tokens_seen": 8040176, "step": 23885 }, { "epoch": 18.462132921174653, "grad_norm": 0.840693473815918, "learning_rate": 8.960395882792544e-07, "loss": 0.4587, "num_input_tokens_seen": 8041936, "step": 23890 }, { "epoch": 18.465996908809892, "grad_norm": 0.9702696204185486, "learning_rate": 8.9157169854992e-07, "loss": 0.4037, "num_input_tokens_seen": 8043632, "step": 23895 }, { "epoch": 18.46986089644513, "grad_norm": 1.068512201309204, "learning_rate": 8.871147734781538e-07, "loss": 0.417, "num_input_tokens_seen": 8045456, "step": 23900 }, { "epoch": 18.473724884080372, "grad_norm": 1.4177625179290771, "learning_rate": 8.826688150909979e-07, "loss": 0.4453, "num_input_tokens_seen": 8047216, "step": 23905 }, { "epoch": 18.47758887171561, "grad_norm": 0.8251640796661377, "learning_rate": 8.782338254104932e-07, "loss": 0.4066, "num_input_tokens_seen": 8048944, "step": 23910 }, { "epoch": 18.48145285935085, "grad_norm": 1.2504870891571045, "learning_rate": 8.738098064537098e-07, "loss": 0.6194, "num_input_tokens_seen": 8050512, "step": 23915 }, { "epoch": 18.48531684698609, "grad_norm": 0.8760752081871033, "learning_rate": 8.693967602327102e-07, "loss": 0.4066, "num_input_tokens_seen": 8052304, "step": 23920 }, { "epoch": 18.48918083462133, "grad_norm": 1.203676462173462, "learning_rate": 8.649946887545751e-07, "loss": 0.5211, "num_input_tokens_seen": 8054032, "step": 23925 }, { "epoch": 18.493044822256568, "grad_norm": 1.3263477087020874, "learning_rate": 8.606035940213974e-07, "loss": 0.7165, "num_input_tokens_seen": 8055568, "step": 23930 }, { "epoch": 18.49690880989181, "grad_norm": 1.2578572034835815, "learning_rate": 8.562234780302686e-07, "loss": 0.4347, "num_input_tokens_seen": 8057136, "step": 23935 }, { "epoch": 18.50077279752705, "grad_norm": 1.052038311958313, "learning_rate": 8.51854342773295e-07, "loss": 0.4223, "num_input_tokens_seen": 8058800, "step": 23940 }, { "epoch": 18.504636785162287, "grad_norm": 1.0208797454833984, "learning_rate": 8.474961902375816e-07, "loss": 0.3927, "num_input_tokens_seen": 8060528, "step": 23945 }, { "epoch": 18.508500772797525, "grad_norm": 1.0984001159667969, "learning_rate": 8.431490224052457e-07, "loss": 0.5101, "num_input_tokens_seen": 8062416, "step": 23950 }, { "epoch": 18.512364760432767, "grad_norm": 0.7867918014526367, "learning_rate": 8.388128412534029e-07, "loss": 0.4195, "num_input_tokens_seen": 8064272, "step": 23955 }, { "epoch": 18.516228748068006, "grad_norm": 0.8601576089859009, "learning_rate": 8.344876487541759e-07, "loss": 0.4631, "num_input_tokens_seen": 8066096, "step": 23960 }, { "epoch": 18.520092735703244, "grad_norm": 1.3364818096160889, "learning_rate": 8.30173446874688e-07, "loss": 0.3719, "num_input_tokens_seen": 8067600, "step": 23965 }, { "epoch": 18.523956723338486, "grad_norm": 0.7983106374740601, "learning_rate": 8.258702375770644e-07, "loss": 0.3636, "num_input_tokens_seen": 8069488, "step": 23970 }, { "epoch": 18.527820710973725, "grad_norm": 1.3929957151412964, "learning_rate": 8.215780228184311e-07, "loss": 0.542, "num_input_tokens_seen": 8071024, "step": 23975 }, { "epoch": 18.531684698608963, "grad_norm": 0.7866128087043762, "learning_rate": 8.172968045509127e-07, "loss": 0.4253, "num_input_tokens_seen": 8072528, "step": 23980 }, { "epoch": 18.535548686244205, "grad_norm": 0.7135565876960754, "learning_rate": 8.130265847216295e-07, "loss": 0.3878, "num_input_tokens_seen": 8074000, "step": 23985 }, { "epoch": 18.539412673879443, "grad_norm": 1.1763540506362915, "learning_rate": 8.087673652727057e-07, "loss": 0.3912, "num_input_tokens_seen": 8075632, "step": 23990 }, { "epoch": 18.543276661514682, "grad_norm": 1.3799939155578613, "learning_rate": 8.045191481412584e-07, "loss": 0.8119, "num_input_tokens_seen": 8077584, "step": 23995 }, { "epoch": 18.547140649149924, "grad_norm": 0.7365400195121765, "learning_rate": 8.00281935259406e-07, "loss": 0.442, "num_input_tokens_seen": 8079280, "step": 24000 }, { "epoch": 18.551004636785162, "grad_norm": 0.7008447647094727, "learning_rate": 7.960557285542569e-07, "loss": 0.3897, "num_input_tokens_seen": 8080848, "step": 24005 }, { "epoch": 18.5548686244204, "grad_norm": 1.0229395627975464, "learning_rate": 7.918405299479126e-07, "loss": 0.487, "num_input_tokens_seen": 8082512, "step": 24010 }, { "epoch": 18.558732612055643, "grad_norm": 1.1513320207595825, "learning_rate": 7.876363413574728e-07, "loss": 0.4738, "num_input_tokens_seen": 8084112, "step": 24015 }, { "epoch": 18.56259659969088, "grad_norm": 0.7350494265556335, "learning_rate": 7.834431646950275e-07, "loss": 0.4551, "num_input_tokens_seen": 8085616, "step": 24020 }, { "epoch": 18.56646058732612, "grad_norm": 1.5465031862258911, "learning_rate": 7.792610018676538e-07, "loss": 0.5533, "num_input_tokens_seen": 8087568, "step": 24025 }, { "epoch": 18.57032457496136, "grad_norm": 1.4709877967834473, "learning_rate": 7.750898547774305e-07, "loss": 0.529, "num_input_tokens_seen": 8089296, "step": 24030 }, { "epoch": 18.5741885625966, "grad_norm": 0.8008494973182678, "learning_rate": 7.709297253214231e-07, "loss": 0.444, "num_input_tokens_seen": 8091120, "step": 24035 }, { "epoch": 18.57805255023184, "grad_norm": 1.1131807565689087, "learning_rate": 7.667806153916768e-07, "loss": 0.4748, "num_input_tokens_seen": 8092624, "step": 24040 }, { "epoch": 18.58191653786708, "grad_norm": 1.3011281490325928, "learning_rate": 7.626425268752318e-07, "loss": 0.5292, "num_input_tokens_seen": 8094224, "step": 24045 }, { "epoch": 18.58578052550232, "grad_norm": 0.7557945251464844, "learning_rate": 7.585154616541191e-07, "loss": 0.3579, "num_input_tokens_seen": 8095920, "step": 24050 }, { "epoch": 18.589644513137557, "grad_norm": 2.0399179458618164, "learning_rate": 7.543994216053535e-07, "loss": 0.4261, "num_input_tokens_seen": 8097648, "step": 24055 }, { "epoch": 18.5935085007728, "grad_norm": 1.2018426656723022, "learning_rate": 7.502944086009267e-07, "loss": 0.4261, "num_input_tokens_seen": 8099440, "step": 24060 }, { "epoch": 18.597372488408038, "grad_norm": 0.6531675457954407, "learning_rate": 7.462004245078313e-07, "loss": 0.4, "num_input_tokens_seen": 8101232, "step": 24065 }, { "epoch": 18.601236476043276, "grad_norm": 0.5934195518493652, "learning_rate": 7.421174711880307e-07, "loss": 0.4011, "num_input_tokens_seen": 8102640, "step": 24070 }, { "epoch": 18.605100463678518, "grad_norm": 0.9645745754241943, "learning_rate": 7.380455504984812e-07, "loss": 0.3463, "num_input_tokens_seen": 8104208, "step": 24075 }, { "epoch": 18.608964451313756, "grad_norm": 0.9416379332542419, "learning_rate": 7.339846642911152e-07, "loss": 0.3861, "num_input_tokens_seen": 8105840, "step": 24080 }, { "epoch": 18.612828438948995, "grad_norm": 0.9655539989471436, "learning_rate": 7.299348144128471e-07, "loss": 0.5316, "num_input_tokens_seen": 8107344, "step": 24085 }, { "epoch": 18.616692426584233, "grad_norm": 0.9898302555084229, "learning_rate": 7.258960027055756e-07, "loss": 0.3686, "num_input_tokens_seen": 8108976, "step": 24090 }, { "epoch": 18.620556414219475, "grad_norm": 1.0500863790512085, "learning_rate": 7.218682310061675e-07, "loss": 0.4125, "num_input_tokens_seen": 8110704, "step": 24095 }, { "epoch": 18.624420401854714, "grad_norm": 0.7266284227371216, "learning_rate": 7.178515011464882e-07, "loss": 0.6174, "num_input_tokens_seen": 8112304, "step": 24100 }, { "epoch": 18.628284389489952, "grad_norm": 0.655013382434845, "learning_rate": 7.138458149533678e-07, "loss": 0.3568, "num_input_tokens_seen": 8114064, "step": 24105 }, { "epoch": 18.632148377125194, "grad_norm": 0.7117183804512024, "learning_rate": 7.098511742486103e-07, "loss": 0.3165, "num_input_tokens_seen": 8115824, "step": 24110 }, { "epoch": 18.636012364760433, "grad_norm": 1.0571949481964111, "learning_rate": 7.058675808490095e-07, "loss": 0.5966, "num_input_tokens_seen": 8117776, "step": 24115 }, { "epoch": 18.63987635239567, "grad_norm": 1.0401256084442139, "learning_rate": 7.018950365663246e-07, "loss": 0.6041, "num_input_tokens_seen": 8119632, "step": 24120 }, { "epoch": 18.643740340030913, "grad_norm": 0.8844929933547974, "learning_rate": 6.979335432072937e-07, "loss": 0.5079, "num_input_tokens_seen": 8121200, "step": 24125 }, { "epoch": 18.64760432766615, "grad_norm": 0.549325704574585, "learning_rate": 6.939831025736226e-07, "loss": 0.3381, "num_input_tokens_seen": 8122896, "step": 24130 }, { "epoch": 18.65146831530139, "grad_norm": 1.0661616325378418, "learning_rate": 6.900437164620022e-07, "loss": 0.6356, "num_input_tokens_seen": 8124496, "step": 24135 }, { "epoch": 18.655332302936632, "grad_norm": 0.8133041262626648, "learning_rate": 6.861153866640879e-07, "loss": 0.3762, "num_input_tokens_seen": 8126096, "step": 24140 }, { "epoch": 18.65919629057187, "grad_norm": 1.1382725238800049, "learning_rate": 6.821981149665064e-07, "loss": 0.401, "num_input_tokens_seen": 8127824, "step": 24145 }, { "epoch": 18.66306027820711, "grad_norm": 1.0337485074996948, "learning_rate": 6.782919031508517e-07, "loss": 0.6722, "num_input_tokens_seen": 8129520, "step": 24150 }, { "epoch": 18.66692426584235, "grad_norm": 0.7866945266723633, "learning_rate": 6.743967529936974e-07, "loss": 0.5233, "num_input_tokens_seen": 8131216, "step": 24155 }, { "epoch": 18.67078825347759, "grad_norm": 0.9021016955375671, "learning_rate": 6.705126662665817e-07, "loss": 0.4924, "num_input_tokens_seen": 8132880, "step": 24160 }, { "epoch": 18.674652241112828, "grad_norm": 1.4840046167373657, "learning_rate": 6.666396447360084e-07, "loss": 0.3785, "num_input_tokens_seen": 8134704, "step": 24165 }, { "epoch": 18.67851622874807, "grad_norm": 1.0911343097686768, "learning_rate": 6.627776901634519e-07, "loss": 0.4131, "num_input_tokens_seen": 8136400, "step": 24170 }, { "epoch": 18.682380216383308, "grad_norm": 1.3219659328460693, "learning_rate": 6.589268043053514e-07, "loss": 0.3866, "num_input_tokens_seen": 8137968, "step": 24175 }, { "epoch": 18.686244204018546, "grad_norm": 1.0471786260604858, "learning_rate": 6.550869889131144e-07, "loss": 0.4714, "num_input_tokens_seen": 8139728, "step": 24180 }, { "epoch": 18.69010819165379, "grad_norm": 1.00910484790802, "learning_rate": 6.512582457331107e-07, "loss": 0.4943, "num_input_tokens_seen": 8141232, "step": 24185 }, { "epoch": 18.693972179289027, "grad_norm": 1.282530426979065, "learning_rate": 6.474405765066721e-07, "loss": 0.4643, "num_input_tokens_seen": 8142896, "step": 24190 }, { "epoch": 18.697836166924265, "grad_norm": 1.079357624053955, "learning_rate": 6.436339829701044e-07, "loss": 0.6917, "num_input_tokens_seen": 8144368, "step": 24195 }, { "epoch": 18.701700154559504, "grad_norm": 1.284315586090088, "learning_rate": 6.398384668546669e-07, "loss": 0.3972, "num_input_tokens_seen": 8146064, "step": 24200 }, { "epoch": 18.705564142194746, "grad_norm": 0.879381537437439, "learning_rate": 6.360540298865764e-07, "loss": 0.3945, "num_input_tokens_seen": 8147728, "step": 24205 }, { "epoch": 18.709428129829984, "grad_norm": 0.9723437428474426, "learning_rate": 6.322806737870279e-07, "loss": 0.4347, "num_input_tokens_seen": 8149296, "step": 24210 }, { "epoch": 18.713292117465222, "grad_norm": 1.1128664016723633, "learning_rate": 6.285184002721628e-07, "loss": 0.4262, "num_input_tokens_seen": 8150928, "step": 24215 }, { "epoch": 18.717156105100464, "grad_norm": 1.3989845514297485, "learning_rate": 6.247672110530816e-07, "loss": 0.4426, "num_input_tokens_seen": 8152624, "step": 24220 }, { "epoch": 18.721020092735703, "grad_norm": 1.2452332973480225, "learning_rate": 6.210271078358503e-07, "loss": 0.4308, "num_input_tokens_seen": 8154416, "step": 24225 }, { "epoch": 18.72488408037094, "grad_norm": 1.573270320892334, "learning_rate": 6.172980923214889e-07, "loss": 0.4531, "num_input_tokens_seen": 8156208, "step": 24230 }, { "epoch": 18.728748068006183, "grad_norm": 0.8053200244903564, "learning_rate": 6.13580166205982e-07, "loss": 0.5621, "num_input_tokens_seen": 8158000, "step": 24235 }, { "epoch": 18.73261205564142, "grad_norm": 1.1114832162857056, "learning_rate": 6.098733311802552e-07, "loss": 0.3918, "num_input_tokens_seen": 8159600, "step": 24240 }, { "epoch": 18.73647604327666, "grad_norm": 0.6806671023368835, "learning_rate": 6.061775889302068e-07, "loss": 0.4331, "num_input_tokens_seen": 8161200, "step": 24245 }, { "epoch": 18.740340030911902, "grad_norm": 2.3795619010925293, "learning_rate": 6.024929411366787e-07, "loss": 0.4126, "num_input_tokens_seen": 8163088, "step": 24250 }, { "epoch": 18.74420401854714, "grad_norm": 0.7036269903182983, "learning_rate": 5.988193894754746e-07, "loss": 0.4774, "num_input_tokens_seen": 8164688, "step": 24255 }, { "epoch": 18.74806800618238, "grad_norm": 0.9785423874855042, "learning_rate": 5.951569356173414e-07, "loss": 0.4085, "num_input_tokens_seen": 8166160, "step": 24260 }, { "epoch": 18.75193199381762, "grad_norm": 0.7650579810142517, "learning_rate": 5.915055812279913e-07, "loss": 0.4212, "num_input_tokens_seen": 8167664, "step": 24265 }, { "epoch": 18.75579598145286, "grad_norm": 0.9768291711807251, "learning_rate": 5.878653279680762e-07, "loss": 0.4188, "num_input_tokens_seen": 8169584, "step": 24270 }, { "epoch": 18.759659969088098, "grad_norm": 1.180401086807251, "learning_rate": 5.842361774932109e-07, "loss": 0.5162, "num_input_tokens_seen": 8171280, "step": 24275 }, { "epoch": 18.76352395672334, "grad_norm": 1.0651775598526, "learning_rate": 5.806181314539527e-07, "loss": 0.3576, "num_input_tokens_seen": 8172944, "step": 24280 }, { "epoch": 18.76738794435858, "grad_norm": 1.1880409717559814, "learning_rate": 5.770111914958104e-07, "loss": 0.4707, "num_input_tokens_seen": 8174544, "step": 24285 }, { "epoch": 18.771251931993817, "grad_norm": 0.7994439005851746, "learning_rate": 5.734153592592412e-07, "loss": 0.4427, "num_input_tokens_seen": 8176304, "step": 24290 }, { "epoch": 18.77511591962906, "grad_norm": 1.3475890159606934, "learning_rate": 5.698306363796535e-07, "loss": 0.3455, "num_input_tokens_seen": 8177840, "step": 24295 }, { "epoch": 18.778979907264297, "grad_norm": 1.1583118438720703, "learning_rate": 5.66257024487396e-07, "loss": 0.443, "num_input_tokens_seen": 8179504, "step": 24300 }, { "epoch": 18.782843894899536, "grad_norm": 1.3419643640518188, "learning_rate": 5.626945252077714e-07, "loss": 0.5656, "num_input_tokens_seen": 8181104, "step": 24305 }, { "epoch": 18.786707882534778, "grad_norm": 1.908998727798462, "learning_rate": 5.591431401610253e-07, "loss": 0.5701, "num_input_tokens_seen": 8182640, "step": 24310 }, { "epoch": 18.790571870170016, "grad_norm": 0.9196651577949524, "learning_rate": 5.556028709623545e-07, "loss": 0.3542, "num_input_tokens_seen": 8184144, "step": 24315 }, { "epoch": 18.794435857805254, "grad_norm": 0.5737335085868835, "learning_rate": 5.520737192218877e-07, "loss": 0.4892, "num_input_tokens_seen": 8185936, "step": 24320 }, { "epoch": 18.798299845440496, "grad_norm": 0.9624753594398499, "learning_rate": 5.48555686544705e-07, "loss": 0.4106, "num_input_tokens_seen": 8187440, "step": 24325 }, { "epoch": 18.802163833075735, "grad_norm": 1.2544842958450317, "learning_rate": 5.450487745308319e-07, "loss": 0.3815, "num_input_tokens_seen": 8189168, "step": 24330 }, { "epoch": 18.806027820710973, "grad_norm": 0.8974002599716187, "learning_rate": 5.415529847752287e-07, "loss": 0.3586, "num_input_tokens_seen": 8190640, "step": 24335 }, { "epoch": 18.80989180834621, "grad_norm": 0.9531793594360352, "learning_rate": 5.380683188678042e-07, "loss": 0.5114, "num_input_tokens_seen": 8192272, "step": 24340 }, { "epoch": 18.813755795981454, "grad_norm": 0.6946759819984436, "learning_rate": 5.345947783934075e-07, "loss": 0.3486, "num_input_tokens_seen": 8193840, "step": 24345 }, { "epoch": 18.817619783616692, "grad_norm": 1.4318842887878418, "learning_rate": 5.311323649318189e-07, "loss": 0.6409, "num_input_tokens_seen": 8195408, "step": 24350 }, { "epoch": 18.82148377125193, "grad_norm": 1.1107232570648193, "learning_rate": 5.276810800577736e-07, "loss": 0.4672, "num_input_tokens_seen": 8197424, "step": 24355 }, { "epoch": 18.825347758887172, "grad_norm": 1.260440707206726, "learning_rate": 5.242409253409297e-07, "loss": 0.3771, "num_input_tokens_seen": 8198928, "step": 24360 }, { "epoch": 18.82921174652241, "grad_norm": 0.9262207746505737, "learning_rate": 5.208119023458941e-07, "loss": 0.381, "num_input_tokens_seen": 8200752, "step": 24365 }, { "epoch": 18.83307573415765, "grad_norm": 1.1990464925765991, "learning_rate": 5.173940126322052e-07, "loss": 0.5609, "num_input_tokens_seen": 8202448, "step": 24370 }, { "epoch": 18.83693972179289, "grad_norm": 1.1547130346298218, "learning_rate": 5.139872577543364e-07, "loss": 0.4163, "num_input_tokens_seen": 8204336, "step": 24375 }, { "epoch": 18.84080370942813, "grad_norm": 1.1802409887313843, "learning_rate": 5.105916392617066e-07, "loss": 0.4278, "num_input_tokens_seen": 8206000, "step": 24380 }, { "epoch": 18.844667697063368, "grad_norm": 0.6434523463249207, "learning_rate": 5.07207158698661e-07, "loss": 0.3561, "num_input_tokens_seen": 8207504, "step": 24385 }, { "epoch": 18.84853168469861, "grad_norm": 0.9527152180671692, "learning_rate": 5.038338176044794e-07, "loss": 0.4173, "num_input_tokens_seen": 8209008, "step": 24390 }, { "epoch": 18.85239567233385, "grad_norm": 1.0609136819839478, "learning_rate": 5.004716175133817e-07, "loss": 0.5019, "num_input_tokens_seen": 8210512, "step": 24395 }, { "epoch": 18.856259659969087, "grad_norm": 1.198307752609253, "learning_rate": 4.971205599545115e-07, "loss": 0.4058, "num_input_tokens_seen": 8212048, "step": 24400 }, { "epoch": 18.86012364760433, "grad_norm": 0.790107250213623, "learning_rate": 4.937806464519551e-07, "loss": 0.39, "num_input_tokens_seen": 8214064, "step": 24405 }, { "epoch": 18.863987635239567, "grad_norm": 1.9996709823608398, "learning_rate": 4.904518785247225e-07, "loss": 0.4923, "num_input_tokens_seen": 8215856, "step": 24410 }, { "epoch": 18.867851622874806, "grad_norm": 1.7019460201263428, "learning_rate": 4.871342576867555e-07, "loss": 0.5694, "num_input_tokens_seen": 8217680, "step": 24415 }, { "epoch": 18.871715610510048, "grad_norm": 0.9916306734085083, "learning_rate": 4.83827785446933e-07, "loss": 0.382, "num_input_tokens_seen": 8219536, "step": 24420 }, { "epoch": 18.875579598145286, "grad_norm": 1.3273305892944336, "learning_rate": 4.805324633090525e-07, "loss": 0.3391, "num_input_tokens_seen": 8221328, "step": 24425 }, { "epoch": 18.879443585780525, "grad_norm": 0.8905718326568604, "learning_rate": 4.77248292771848e-07, "loss": 0.4052, "num_input_tokens_seen": 8222736, "step": 24430 }, { "epoch": 18.883307573415767, "grad_norm": 1.4674736261367798, "learning_rate": 4.7397527532898333e-07, "loss": 0.4006, "num_input_tokens_seen": 8224496, "step": 24435 }, { "epoch": 18.887171561051005, "grad_norm": 1.1270586252212524, "learning_rate": 4.7071341246904545e-07, "loss": 0.6591, "num_input_tokens_seen": 8226256, "step": 24440 }, { "epoch": 18.891035548686244, "grad_norm": 0.6782748103141785, "learning_rate": 4.674627056755448e-07, "loss": 0.4733, "num_input_tokens_seen": 8227888, "step": 24445 }, { "epoch": 18.894899536321482, "grad_norm": 1.2156354188919067, "learning_rate": 4.642231564269267e-07, "loss": 0.3587, "num_input_tokens_seen": 8229776, "step": 24450 }, { "epoch": 18.898763523956724, "grad_norm": 1.7407722473144531, "learning_rate": 4.609947661965569e-07, "loss": 0.4093, "num_input_tokens_seen": 8231440, "step": 24455 }, { "epoch": 18.902627511591962, "grad_norm": 1.251694917678833, "learning_rate": 4.577775364527248e-07, "loss": 0.4955, "num_input_tokens_seen": 8233360, "step": 24460 }, { "epoch": 18.9064914992272, "grad_norm": 0.6688297986984253, "learning_rate": 4.545714686586461e-07, "loss": 0.3956, "num_input_tokens_seen": 8235024, "step": 24465 }, { "epoch": 18.910355486862443, "grad_norm": 0.8802443742752075, "learning_rate": 4.513765642724599e-07, "loss": 0.4327, "num_input_tokens_seen": 8236976, "step": 24470 }, { "epoch": 18.91421947449768, "grad_norm": 1.0946635007858276, "learning_rate": 4.4819282474722893e-07, "loss": 0.477, "num_input_tokens_seen": 8238768, "step": 24475 }, { "epoch": 18.91808346213292, "grad_norm": 0.8943836688995361, "learning_rate": 4.4502025153093376e-07, "loss": 0.569, "num_input_tokens_seen": 8240624, "step": 24480 }, { "epoch": 18.92194744976816, "grad_norm": 0.8405798077583313, "learning_rate": 4.4185884606648686e-07, "loss": 0.4827, "num_input_tokens_seen": 8242448, "step": 24485 }, { "epoch": 18.9258114374034, "grad_norm": 0.818545401096344, "learning_rate": 4.387086097917076e-07, "loss": 0.4286, "num_input_tokens_seen": 8244144, "step": 24490 }, { "epoch": 18.92967542503864, "grad_norm": 0.6189785599708557, "learning_rate": 4.3556954413934424e-07, "loss": 0.3952, "num_input_tokens_seen": 8246224, "step": 24495 }, { "epoch": 18.93353941267388, "grad_norm": 0.978488028049469, "learning_rate": 4.3244165053706323e-07, "loss": 0.366, "num_input_tokens_seen": 8247728, "step": 24500 }, { "epoch": 18.93740340030912, "grad_norm": 1.0306841135025024, "learning_rate": 4.293249304074487e-07, "loss": 0.8404, "num_input_tokens_seen": 8249360, "step": 24505 }, { "epoch": 18.941267387944357, "grad_norm": 0.5798084735870361, "learning_rate": 4.2621938516800296e-07, "loss": 0.386, "num_input_tokens_seen": 8250832, "step": 24510 }, { "epoch": 18.9451313755796, "grad_norm": 0.615127444267273, "learning_rate": 4.231250162311462e-07, "loss": 0.5598, "num_input_tokens_seen": 8252432, "step": 24515 }, { "epoch": 18.948995363214838, "grad_norm": 0.6434810757637024, "learning_rate": 4.200418250042193e-07, "loss": 0.57, "num_input_tokens_seen": 8253936, "step": 24520 }, { "epoch": 18.952859350850076, "grad_norm": 0.9211252331733704, "learning_rate": 4.1696981288947556e-07, "loss": 0.4588, "num_input_tokens_seen": 8255408, "step": 24525 }, { "epoch": 18.956723338485318, "grad_norm": 1.1474330425262451, "learning_rate": 4.1390898128408076e-07, "loss": 0.434, "num_input_tokens_seen": 8257136, "step": 24530 }, { "epoch": 18.960587326120557, "grad_norm": 0.9258624911308289, "learning_rate": 4.10859331580124e-07, "loss": 0.4025, "num_input_tokens_seen": 8258832, "step": 24535 }, { "epoch": 18.964451313755795, "grad_norm": 2.569949150085449, "learning_rate": 4.078208651645987e-07, "loss": 0.7417, "num_input_tokens_seen": 8260400, "step": 24540 }, { "epoch": 18.968315301391037, "grad_norm": 1.0449764728546143, "learning_rate": 4.0479358341942164e-07, "loss": 0.448, "num_input_tokens_seen": 8261904, "step": 24545 }, { "epoch": 18.972179289026275, "grad_norm": 1.4372541904449463, "learning_rate": 4.0177748772141646e-07, "loss": 0.4003, "num_input_tokens_seen": 8263408, "step": 24550 }, { "epoch": 18.976043276661514, "grad_norm": 0.7178856730461121, "learning_rate": 3.9877257944232474e-07, "loss": 0.538, "num_input_tokens_seen": 8265008, "step": 24555 }, { "epoch": 18.979907264296756, "grad_norm": 1.1256009340286255, "learning_rate": 3.957788599487949e-07, "loss": 0.7159, "num_input_tokens_seen": 8266960, "step": 24560 }, { "epoch": 18.983771251931994, "grad_norm": 0.7040135264396667, "learning_rate": 3.9279633060238797e-07, "loss": 0.39, "num_input_tokens_seen": 8268752, "step": 24565 }, { "epoch": 18.987635239567233, "grad_norm": 0.7466974854469299, "learning_rate": 3.8982499275957704e-07, "loss": 0.3985, "num_input_tokens_seen": 8270544, "step": 24570 }, { "epoch": 18.991499227202475, "grad_norm": 0.7317586541175842, "learning_rate": 3.8686484777174513e-07, "loss": 0.5168, "num_input_tokens_seen": 8271952, "step": 24575 }, { "epoch": 18.995363214837713, "grad_norm": 0.8098863363265991, "learning_rate": 3.8391589698517915e-07, "loss": 0.3917, "num_input_tokens_seen": 8273904, "step": 24580 }, { "epoch": 18.99922720247295, "grad_norm": 1.2888290882110596, "learning_rate": 3.809781417410868e-07, "loss": 0.3964, "num_input_tokens_seen": 8275984, "step": 24585 }, { "epoch": 19.0, "eval_loss": 0.46278631687164307, "eval_runtime": 6.3711, "eval_samples_per_second": 90.252, "eval_steps_per_second": 22.602, "num_input_tokens_seen": 8276160, "step": 24586 }, { "epoch": 19.00309119010819, "grad_norm": 0.9315847754478455, "learning_rate": 3.7805158337557155e-07, "loss": 0.4162, "num_input_tokens_seen": 8277504, "step": 24590 }, { "epoch": 19.006955177743432, "grad_norm": 1.5664379596710205, "learning_rate": 3.7513622321964927e-07, "loss": 0.4337, "num_input_tokens_seen": 8279040, "step": 24595 }, { "epoch": 19.01081916537867, "grad_norm": 0.793600857257843, "learning_rate": 3.7223206259924813e-07, "loss": 0.679, "num_input_tokens_seen": 8280672, "step": 24600 }, { "epoch": 19.01468315301391, "grad_norm": 1.06793212890625, "learning_rate": 3.69339102835195e-07, "loss": 0.3778, "num_input_tokens_seen": 8282272, "step": 24605 }, { "epoch": 19.01854714064915, "grad_norm": 0.6456027030944824, "learning_rate": 3.664573452432235e-07, "loss": 0.4528, "num_input_tokens_seen": 8284032, "step": 24610 }, { "epoch": 19.02241112828439, "grad_norm": 0.7163226008415222, "learning_rate": 3.635867911339741e-07, "loss": 0.4424, "num_input_tokens_seen": 8285632, "step": 24615 }, { "epoch": 19.026275115919628, "grad_norm": 0.7584263682365417, "learning_rate": 3.607274418129969e-07, "loss": 0.4535, "num_input_tokens_seen": 8287264, "step": 24620 }, { "epoch": 19.03013910355487, "grad_norm": 0.6902530789375305, "learning_rate": 3.5787929858073777e-07, "loss": 0.4138, "num_input_tokens_seen": 8289088, "step": 24625 }, { "epoch": 19.034003091190108, "grad_norm": 0.8390199542045593, "learning_rate": 3.5504236273254943e-07, "loss": 0.4326, "num_input_tokens_seen": 8290752, "step": 24630 }, { "epoch": 19.037867078825347, "grad_norm": 0.7559042572975159, "learning_rate": 3.5221663555868587e-07, "loss": 0.3615, "num_input_tokens_seen": 8292608, "step": 24635 }, { "epoch": 19.04173106646059, "grad_norm": 0.7514026761054993, "learning_rate": 3.4940211834430804e-07, "loss": 0.4007, "num_input_tokens_seen": 8294400, "step": 24640 }, { "epoch": 19.045595054095827, "grad_norm": 1.1455904245376587, "learning_rate": 3.4659881236947246e-07, "loss": 0.4752, "num_input_tokens_seen": 8295936, "step": 24645 }, { "epoch": 19.049459041731065, "grad_norm": 0.9055631160736084, "learning_rate": 3.4380671890913985e-07, "loss": 0.4121, "num_input_tokens_seen": 8297408, "step": 24650 }, { "epoch": 19.053323029366307, "grad_norm": 1.0766969919204712, "learning_rate": 3.410258392331722e-07, "loss": 0.4096, "num_input_tokens_seen": 8299232, "step": 24655 }, { "epoch": 19.057187017001546, "grad_norm": 1.3754698038101196, "learning_rate": 3.3825617460633006e-07, "loss": 0.3921, "num_input_tokens_seen": 8300928, "step": 24660 }, { "epoch": 19.061051004636784, "grad_norm": 1.333513617515564, "learning_rate": 3.3549772628827524e-07, "loss": 0.3859, "num_input_tokens_seen": 8302720, "step": 24665 }, { "epoch": 19.064914992272026, "grad_norm": 2.123677968978882, "learning_rate": 3.327504955335625e-07, "loss": 0.4953, "num_input_tokens_seen": 8304256, "step": 24670 }, { "epoch": 19.068778979907265, "grad_norm": 1.1777070760726929, "learning_rate": 3.30014483591648e-07, "loss": 0.3973, "num_input_tokens_seen": 8306080, "step": 24675 }, { "epoch": 19.072642967542503, "grad_norm": 0.852579653263092, "learning_rate": 3.2728969170689183e-07, "loss": 0.6958, "num_input_tokens_seen": 8307648, "step": 24680 }, { "epoch": 19.076506955177745, "grad_norm": 0.877249002456665, "learning_rate": 3.2457612111854165e-07, "loss": 0.3183, "num_input_tokens_seen": 8309152, "step": 24685 }, { "epoch": 19.080370942812984, "grad_norm": 1.4673690795898438, "learning_rate": 3.218737730607491e-07, "loss": 0.6381, "num_input_tokens_seen": 8310816, "step": 24690 }, { "epoch": 19.084234930448222, "grad_norm": 1.0026135444641113, "learning_rate": 3.191826487625532e-07, "loss": 0.5053, "num_input_tokens_seen": 8312480, "step": 24695 }, { "epoch": 19.08809891808346, "grad_norm": 0.7842795848846436, "learning_rate": 3.1650274944790004e-07, "loss": 0.3634, "num_input_tokens_seen": 8314336, "step": 24700 }, { "epoch": 19.091962905718702, "grad_norm": 0.7692320346832275, "learning_rate": 3.1383407633561734e-07, "loss": 0.3641, "num_input_tokens_seen": 8315872, "step": 24705 }, { "epoch": 19.09582689335394, "grad_norm": 1.2534396648406982, "learning_rate": 3.1117663063943705e-07, "loss": 0.4855, "num_input_tokens_seen": 8317600, "step": 24710 }, { "epoch": 19.09969088098918, "grad_norm": 1.3536690473556519, "learning_rate": 3.0853041356798116e-07, "loss": 0.5446, "num_input_tokens_seen": 8319136, "step": 24715 }, { "epoch": 19.10355486862442, "grad_norm": 0.8888530731201172, "learning_rate": 3.058954263247621e-07, "loss": 0.4046, "num_input_tokens_seen": 8320960, "step": 24720 }, { "epoch": 19.10741885625966, "grad_norm": 1.0368974208831787, "learning_rate": 3.0327167010819333e-07, "loss": 0.3867, "num_input_tokens_seen": 8322688, "step": 24725 }, { "epoch": 19.111282843894898, "grad_norm": 1.413233995437622, "learning_rate": 3.006591461115704e-07, "loss": 0.3743, "num_input_tokens_seen": 8324352, "step": 24730 }, { "epoch": 19.11514683153014, "grad_norm": 0.9857497811317444, "learning_rate": 2.9805785552308727e-07, "loss": 0.453, "num_input_tokens_seen": 8326112, "step": 24735 }, { "epoch": 19.11901081916538, "grad_norm": 0.7772648334503174, "learning_rate": 2.954677995258254e-07, "loss": 0.3543, "num_input_tokens_seen": 8327808, "step": 24740 }, { "epoch": 19.122874806800617, "grad_norm": 1.6274937391281128, "learning_rate": 2.9288897929775905e-07, "loss": 0.4145, "num_input_tokens_seen": 8329376, "step": 24745 }, { "epoch": 19.12673879443586, "grad_norm": 0.7542877197265625, "learning_rate": 2.9032139601174734e-07, "loss": 0.4542, "num_input_tokens_seen": 8331008, "step": 24750 }, { "epoch": 19.130602782071097, "grad_norm": 0.954581618309021, "learning_rate": 2.8776505083554504e-07, "loss": 0.3803, "num_input_tokens_seen": 8332544, "step": 24755 }, { "epoch": 19.134466769706336, "grad_norm": 0.956078052520752, "learning_rate": 2.852199449317944e-07, "loss": 0.5931, "num_input_tokens_seen": 8334208, "step": 24760 }, { "epoch": 19.138330757341578, "grad_norm": 1.2389456033706665, "learning_rate": 2.8268607945802493e-07, "loss": 0.4854, "num_input_tokens_seen": 8335936, "step": 24765 }, { "epoch": 19.142194744976816, "grad_norm": 0.9497150182723999, "learning_rate": 2.801634555666538e-07, "loss": 0.3716, "num_input_tokens_seen": 8337600, "step": 24770 }, { "epoch": 19.146058732612055, "grad_norm": 0.9107908606529236, "learning_rate": 2.7765207440498266e-07, "loss": 0.3587, "num_input_tokens_seen": 8339360, "step": 24775 }, { "epoch": 19.149922720247297, "grad_norm": 1.0305821895599365, "learning_rate": 2.751519371152034e-07, "loss": 0.3791, "num_input_tokens_seen": 8340896, "step": 24780 }, { "epoch": 19.153786707882535, "grad_norm": 1.7468706369400024, "learning_rate": 2.726630448343953e-07, "loss": 0.4359, "num_input_tokens_seen": 8342464, "step": 24785 }, { "epoch": 19.157650695517773, "grad_norm": 0.7992207407951355, "learning_rate": 2.7018539869451963e-07, "loss": 0.3954, "num_input_tokens_seen": 8344352, "step": 24790 }, { "epoch": 19.161514683153015, "grad_norm": 1.227874755859375, "learning_rate": 2.6771899982242774e-07, "loss": 0.4153, "num_input_tokens_seen": 8346048, "step": 24795 }, { "epoch": 19.165378670788254, "grad_norm": 0.7609086036682129, "learning_rate": 2.6526384933984737e-07, "loss": 0.3743, "num_input_tokens_seen": 8347584, "step": 24800 }, { "epoch": 19.169242658423492, "grad_norm": 0.7925766110420227, "learning_rate": 2.6281994836340195e-07, "loss": 0.3369, "num_input_tokens_seen": 8349216, "step": 24805 }, { "epoch": 19.173106646058734, "grad_norm": 1.1427280902862549, "learning_rate": 2.603872980045885e-07, "loss": 0.4283, "num_input_tokens_seen": 8350720, "step": 24810 }, { "epoch": 19.176970633693973, "grad_norm": 1.1500529050827026, "learning_rate": 2.5796589936979423e-07, "loss": 0.3799, "num_input_tokens_seen": 8352320, "step": 24815 }, { "epoch": 19.18083462132921, "grad_norm": 0.7061426639556885, "learning_rate": 2.5555575356027703e-07, "loss": 0.4077, "num_input_tokens_seen": 8354112, "step": 24820 }, { "epoch": 19.18469860896445, "grad_norm": 1.0908372402191162, "learning_rate": 2.531568616721963e-07, "loss": 0.5954, "num_input_tokens_seen": 8355968, "step": 24825 }, { "epoch": 19.18856259659969, "grad_norm": 0.9213977456092834, "learning_rate": 2.5076922479657647e-07, "loss": 0.4348, "num_input_tokens_seen": 8357696, "step": 24830 }, { "epoch": 19.19242658423493, "grad_norm": 0.8816602230072021, "learning_rate": 2.483928440193295e-07, "loss": 0.4758, "num_input_tokens_seen": 8359168, "step": 24835 }, { "epoch": 19.19629057187017, "grad_norm": 0.860771894454956, "learning_rate": 2.460277204212519e-07, "loss": 0.5164, "num_input_tokens_seen": 8360768, "step": 24840 }, { "epoch": 19.20015455950541, "grad_norm": 1.8702605962753296, "learning_rate": 2.43673855078011e-07, "loss": 0.4822, "num_input_tokens_seen": 8362528, "step": 24845 }, { "epoch": 19.20401854714065, "grad_norm": 1.060981035232544, "learning_rate": 2.413312490601588e-07, "loss": 0.4765, "num_input_tokens_seen": 8364224, "step": 24850 }, { "epoch": 19.207882534775887, "grad_norm": 1.109319806098938, "learning_rate": 2.3899990343312916e-07, "loss": 0.3362, "num_input_tokens_seen": 8366144, "step": 24855 }, { "epoch": 19.21174652241113, "grad_norm": 0.8879550695419312, "learning_rate": 2.3667981925723226e-07, "loss": 0.3569, "num_input_tokens_seen": 8367936, "step": 24860 }, { "epoch": 19.215610510046368, "grad_norm": 0.7340118885040283, "learning_rate": 2.3437099758765734e-07, "loss": 0.371, "num_input_tokens_seen": 8369696, "step": 24865 }, { "epoch": 19.219474497681606, "grad_norm": 0.6358599066734314, "learning_rate": 2.3207343947446447e-07, "loss": 0.4365, "num_input_tokens_seen": 8371584, "step": 24870 }, { "epoch": 19.223338485316848, "grad_norm": 1.0630885362625122, "learning_rate": 2.2978714596260108e-07, "loss": 0.4191, "num_input_tokens_seen": 8373216, "step": 24875 }, { "epoch": 19.227202472952087, "grad_norm": 0.8347788453102112, "learning_rate": 2.275121180918882e-07, "loss": 0.369, "num_input_tokens_seen": 8374912, "step": 24880 }, { "epoch": 19.231066460587325, "grad_norm": 1.2013741731643677, "learning_rate": 2.2524835689702316e-07, "loss": 0.414, "num_input_tokens_seen": 8376704, "step": 24885 }, { "epoch": 19.234930448222567, "grad_norm": 1.0822832584381104, "learning_rate": 2.229958634075713e-07, "loss": 0.3716, "num_input_tokens_seen": 8378592, "step": 24890 }, { "epoch": 19.238794435857805, "grad_norm": 1.4234052896499634, "learning_rate": 2.207546386479853e-07, "loss": 0.4539, "num_input_tokens_seen": 8380160, "step": 24895 }, { "epoch": 19.242658423493044, "grad_norm": 0.9954608082771301, "learning_rate": 2.1852468363758594e-07, "loss": 0.4116, "num_input_tokens_seen": 8381984, "step": 24900 }, { "epoch": 19.246522411128286, "grad_norm": 0.9018245935440063, "learning_rate": 2.1630599939057306e-07, "loss": 0.4016, "num_input_tokens_seen": 8383584, "step": 24905 }, { "epoch": 19.250386398763524, "grad_norm": 0.8250517249107361, "learning_rate": 2.140985869160145e-07, "loss": 0.3896, "num_input_tokens_seen": 8385280, "step": 24910 }, { "epoch": 19.254250386398763, "grad_norm": 1.1788698434829712, "learning_rate": 2.1190244721785435e-07, "loss": 0.4812, "num_input_tokens_seen": 8387232, "step": 24915 }, { "epoch": 19.258114374034005, "grad_norm": 1.1951433420181274, "learning_rate": 2.0971758129491314e-07, "loss": 0.5606, "num_input_tokens_seen": 8388640, "step": 24920 }, { "epoch": 19.261978361669243, "grad_norm": 1.0497801303863525, "learning_rate": 2.0754399014087933e-07, "loss": 0.3593, "num_input_tokens_seen": 8390304, "step": 24925 }, { "epoch": 19.26584234930448, "grad_norm": 0.6425431370735168, "learning_rate": 2.0538167474431214e-07, "loss": 0.3824, "num_input_tokens_seen": 8391840, "step": 24930 }, { "epoch": 19.269706336939723, "grad_norm": 0.6914058923721313, "learning_rate": 2.0323063608865267e-07, "loss": 0.3627, "num_input_tokens_seen": 8393408, "step": 24935 }, { "epoch": 19.273570324574962, "grad_norm": 0.7393187880516052, "learning_rate": 2.0109087515219894e-07, "loss": 0.4119, "num_input_tokens_seen": 8394944, "step": 24940 }, { "epoch": 19.2774343122102, "grad_norm": 0.9417724013328552, "learning_rate": 1.9896239290813078e-07, "loss": 0.4064, "num_input_tokens_seen": 8396704, "step": 24945 }, { "epoch": 19.28129829984544, "grad_norm": 1.0977853536605835, "learning_rate": 1.9684519032449333e-07, "loss": 0.5251, "num_input_tokens_seen": 8398304, "step": 24950 }, { "epoch": 19.28516228748068, "grad_norm": 2.3979227542877197, "learning_rate": 1.947392683642052e-07, "loss": 0.5316, "num_input_tokens_seen": 8399936, "step": 24955 }, { "epoch": 19.28902627511592, "grad_norm": 0.8987363576889038, "learning_rate": 1.9264462798505023e-07, "loss": 0.405, "num_input_tokens_seen": 8401888, "step": 24960 }, { "epoch": 19.292890262751158, "grad_norm": 1.1874927282333374, "learning_rate": 1.905612701396803e-07, "loss": 0.5924, "num_input_tokens_seen": 8403584, "step": 24965 }, { "epoch": 19.2967542503864, "grad_norm": 1.6465423107147217, "learning_rate": 1.884891957756263e-07, "loss": 0.5856, "num_input_tokens_seen": 8405120, "step": 24970 }, { "epoch": 19.300618238021638, "grad_norm": 0.7061609029769897, "learning_rate": 1.864284058352761e-07, "loss": 0.5231, "num_input_tokens_seen": 8406656, "step": 24975 }, { "epoch": 19.304482225656876, "grad_norm": 1.092799186706543, "learning_rate": 1.8437890125589109e-07, "loss": 0.3358, "num_input_tokens_seen": 8408544, "step": 24980 }, { "epoch": 19.30834621329212, "grad_norm": 1.1419724225997925, "learning_rate": 1.8234068296959506e-07, "loss": 0.3925, "num_input_tokens_seen": 8409888, "step": 24985 }, { "epoch": 19.312210200927357, "grad_norm": 1.0450397729873657, "learning_rate": 1.8031375190338261e-07, "loss": 0.4837, "num_input_tokens_seen": 8411584, "step": 24990 }, { "epoch": 19.316074188562595, "grad_norm": 1.268675684928894, "learning_rate": 1.782981089791136e-07, "loss": 0.6535, "num_input_tokens_seen": 8413440, "step": 24995 }, { "epoch": 19.319938176197837, "grad_norm": 0.6908775568008423, "learning_rate": 1.7629375511351852e-07, "loss": 0.3839, "num_input_tokens_seen": 8415040, "step": 25000 }, { "epoch": 19.323802163833076, "grad_norm": 0.8262147903442383, "learning_rate": 1.7430069121818492e-07, "loss": 0.3685, "num_input_tokens_seen": 8416576, "step": 25005 }, { "epoch": 19.327666151468314, "grad_norm": 1.1569859981536865, "learning_rate": 1.7231891819957657e-07, "loss": 0.6633, "num_input_tokens_seen": 8418080, "step": 25010 }, { "epoch": 19.331530139103556, "grad_norm": 1.4134974479675293, "learning_rate": 1.703484369590086e-07, "loss": 0.3669, "num_input_tokens_seen": 8419648, "step": 25015 }, { "epoch": 19.335394126738795, "grad_norm": 0.8804779648780823, "learning_rate": 1.6838924839266966e-07, "loss": 0.4867, "num_input_tokens_seen": 8421344, "step": 25020 }, { "epoch": 19.339258114374033, "grad_norm": 0.8129786849021912, "learning_rate": 1.664413533916137e-07, "loss": 0.4695, "num_input_tokens_seen": 8423040, "step": 25025 }, { "epoch": 19.343122102009275, "grad_norm": 0.758897066116333, "learning_rate": 1.645047528417487e-07, "loss": 0.53, "num_input_tokens_seen": 8424736, "step": 25030 }, { "epoch": 19.346986089644513, "grad_norm": 0.9875097274780273, "learning_rate": 1.62579447623859e-07, "loss": 0.4573, "num_input_tokens_seen": 8426432, "step": 25035 }, { "epoch": 19.350850077279752, "grad_norm": 1.0029600858688354, "learning_rate": 1.606654386135803e-07, "loss": 0.4314, "num_input_tokens_seen": 8428064, "step": 25040 }, { "epoch": 19.354714064914994, "grad_norm": 0.7858657240867615, "learning_rate": 1.5876272668141902e-07, "loss": 0.3876, "num_input_tokens_seen": 8429728, "step": 25045 }, { "epoch": 19.358578052550232, "grad_norm": 0.9707556366920471, "learning_rate": 1.568713126927357e-07, "loss": 0.3713, "num_input_tokens_seen": 8431200, "step": 25050 }, { "epoch": 19.36244204018547, "grad_norm": 0.6726214289665222, "learning_rate": 1.549911975077617e-07, "loss": 0.3527, "num_input_tokens_seen": 8432768, "step": 25055 }, { "epoch": 19.366306027820713, "grad_norm": 1.0503922700881958, "learning_rate": 1.5312238198157968e-07, "loss": 0.4771, "num_input_tokens_seen": 8434400, "step": 25060 }, { "epoch": 19.37017001545595, "grad_norm": 1.264479160308838, "learning_rate": 1.5126486696414032e-07, "loss": 0.415, "num_input_tokens_seen": 8435936, "step": 25065 }, { "epoch": 19.37403400309119, "grad_norm": 0.8975126147270203, "learning_rate": 1.4941865330025394e-07, "loss": 0.4655, "num_input_tokens_seen": 8437408, "step": 25070 }, { "epoch": 19.377897990726428, "grad_norm": 0.9487967491149902, "learning_rate": 1.475837418295878e-07, "loss": 0.4761, "num_input_tokens_seen": 8439168, "step": 25075 }, { "epoch": 19.38176197836167, "grad_norm": 1.286495327949524, "learning_rate": 1.457601333866715e-07, "loss": 0.4343, "num_input_tokens_seen": 8440832, "step": 25080 }, { "epoch": 19.38562596599691, "grad_norm": 0.6833828687667847, "learning_rate": 1.4394782880089443e-07, "loss": 0.4824, "num_input_tokens_seen": 8442464, "step": 25085 }, { "epoch": 19.389489953632147, "grad_norm": 0.8075204491615295, "learning_rate": 1.4214682889649998e-07, "loss": 0.4097, "num_input_tokens_seen": 8444224, "step": 25090 }, { "epoch": 19.39335394126739, "grad_norm": 0.743929386138916, "learning_rate": 1.403571344925969e-07, "loss": 0.3883, "num_input_tokens_seen": 8445792, "step": 25095 }, { "epoch": 19.397217928902627, "grad_norm": 0.8273429870605469, "learning_rate": 1.3857874640314516e-07, "loss": 0.4288, "num_input_tokens_seen": 8447648, "step": 25100 }, { "epoch": 19.401081916537866, "grad_norm": 1.177614450454712, "learning_rate": 1.3681166543697e-07, "loss": 0.3825, "num_input_tokens_seen": 8449376, "step": 25105 }, { "epoch": 19.404945904173108, "grad_norm": 0.9031030535697937, "learning_rate": 1.3505589239775073e-07, "loss": 0.4366, "num_input_tokens_seen": 8451200, "step": 25110 }, { "epoch": 19.408809891808346, "grad_norm": 0.8706795573234558, "learning_rate": 1.3331142808401808e-07, "loss": 0.4729, "num_input_tokens_seen": 8452992, "step": 25115 }, { "epoch": 19.412673879443584, "grad_norm": 1.404490351676941, "learning_rate": 1.315782732891735e-07, "loss": 0.6178, "num_input_tokens_seen": 8454848, "step": 25120 }, { "epoch": 19.416537867078826, "grad_norm": 0.9135048389434814, "learning_rate": 1.2985642880145864e-07, "loss": 0.372, "num_input_tokens_seen": 8456480, "step": 25125 }, { "epoch": 19.420401854714065, "grad_norm": 1.047866940498352, "learning_rate": 1.2814589540398048e-07, "loss": 0.4088, "num_input_tokens_seen": 8457984, "step": 25130 }, { "epoch": 19.424265842349303, "grad_norm": 0.9565009474754333, "learning_rate": 1.2644667387470276e-07, "loss": 0.45, "num_input_tokens_seen": 8459808, "step": 25135 }, { "epoch": 19.428129829984545, "grad_norm": 0.6914417147636414, "learning_rate": 1.247587649864379e-07, "loss": 0.3843, "num_input_tokens_seen": 8461280, "step": 25140 }, { "epoch": 19.431993817619784, "grad_norm": 1.2043194770812988, "learning_rate": 1.230821695068607e-07, "loss": 0.4798, "num_input_tokens_seen": 8462880, "step": 25145 }, { "epoch": 19.435857805255022, "grad_norm": 1.4900822639465332, "learning_rate": 1.214168881984945e-07, "loss": 0.5034, "num_input_tokens_seen": 8464896, "step": 25150 }, { "epoch": 19.439721792890264, "grad_norm": 0.8687078952789307, "learning_rate": 1.1976292181871684e-07, "loss": 0.3702, "num_input_tokens_seen": 8466848, "step": 25155 }, { "epoch": 19.443585780525503, "grad_norm": 1.0864686965942383, "learning_rate": 1.1812027111976764e-07, "loss": 0.4843, "num_input_tokens_seen": 8468640, "step": 25160 }, { "epoch": 19.44744976816074, "grad_norm": 1.6694220304489136, "learning_rate": 1.1648893684872986e-07, "loss": 0.5279, "num_input_tokens_seen": 8470432, "step": 25165 }, { "epoch": 19.451313755795983, "grad_norm": 1.0892215967178345, "learning_rate": 1.1486891974754332e-07, "loss": 0.4428, "num_input_tokens_seen": 8472064, "step": 25170 }, { "epoch": 19.45517774343122, "grad_norm": 1.0430582761764526, "learning_rate": 1.1326022055300478e-07, "loss": 0.558, "num_input_tokens_seen": 8473952, "step": 25175 }, { "epoch": 19.45904173106646, "grad_norm": 0.8630222082138062, "learning_rate": 1.1166283999675953e-07, "loss": 0.5345, "num_input_tokens_seen": 8475488, "step": 25180 }, { "epoch": 19.462905718701702, "grad_norm": 1.021426796913147, "learning_rate": 1.100767788053042e-07, "loss": 0.4952, "num_input_tokens_seen": 8477152, "step": 25185 }, { "epoch": 19.46676970633694, "grad_norm": 0.8613038063049316, "learning_rate": 1.0850203769998957e-07, "loss": 0.4633, "num_input_tokens_seen": 8479008, "step": 25190 }, { "epoch": 19.47063369397218, "grad_norm": 0.583328902721405, "learning_rate": 1.0693861739701771e-07, "loss": 0.3794, "num_input_tokens_seen": 8480608, "step": 25195 }, { "epoch": 19.474497681607417, "grad_norm": 0.7541795969009399, "learning_rate": 1.0538651860744208e-07, "loss": 0.346, "num_input_tokens_seen": 8482176, "step": 25200 }, { "epoch": 19.47836166924266, "grad_norm": 1.5219910144805908, "learning_rate": 1.0384574203716469e-07, "loss": 0.6185, "num_input_tokens_seen": 8483744, "step": 25205 }, { "epoch": 19.482225656877898, "grad_norm": 1.559664011001587, "learning_rate": 1.0231628838694163e-07, "loss": 0.4796, "num_input_tokens_seen": 8485216, "step": 25210 }, { "epoch": 19.486089644513136, "grad_norm": 0.9467640519142151, "learning_rate": 1.0079815835237761e-07, "loss": 0.3357, "num_input_tokens_seen": 8486816, "step": 25215 }, { "epoch": 19.489953632148378, "grad_norm": 0.9066852927207947, "learning_rate": 9.929135262392586e-08, "loss": 0.4328, "num_input_tokens_seen": 8488608, "step": 25220 }, { "epoch": 19.493817619783616, "grad_norm": 1.3418304920196533, "learning_rate": 9.779587188689099e-08, "loss": 0.5051, "num_input_tokens_seen": 8490560, "step": 25225 }, { "epoch": 19.497681607418855, "grad_norm": 0.7589489221572876, "learning_rate": 9.631171682142893e-08, "loss": 0.4093, "num_input_tokens_seen": 8492320, "step": 25230 }, { "epoch": 19.501545595054097, "grad_norm": 0.5871625542640686, "learning_rate": 9.483888810253582e-08, "loss": 0.337, "num_input_tokens_seen": 8494080, "step": 25235 }, { "epoch": 19.505409582689335, "grad_norm": 1.2701952457427979, "learning_rate": 9.337738640007032e-08, "loss": 0.3656, "num_input_tokens_seen": 8495648, "step": 25240 }, { "epoch": 19.509273570324574, "grad_norm": 0.7331749200820923, "learning_rate": 9.192721237873125e-08, "loss": 0.407, "num_input_tokens_seen": 8497248, "step": 25245 }, { "epoch": 19.513137557959816, "grad_norm": 1.2724276781082153, "learning_rate": 9.048836669806326e-08, "loss": 0.705, "num_input_tokens_seen": 8498976, "step": 25250 }, { "epoch": 19.517001545595054, "grad_norm": 1.2714145183563232, "learning_rate": 8.906085001246233e-08, "loss": 0.4312, "num_input_tokens_seen": 8500736, "step": 25255 }, { "epoch": 19.520865533230292, "grad_norm": 0.8840962648391724, "learning_rate": 8.764466297117302e-08, "loss": 0.4253, "num_input_tokens_seen": 8502592, "step": 25260 }, { "epoch": 19.524729520865534, "grad_norm": 1.6237002611160278, "learning_rate": 8.623980621828842e-08, "loss": 0.4707, "num_input_tokens_seen": 8504736, "step": 25265 }, { "epoch": 19.528593508500773, "grad_norm": 1.4181345701217651, "learning_rate": 8.484628039273912e-08, "loss": 0.4875, "num_input_tokens_seen": 8506464, "step": 25270 }, { "epoch": 19.53245749613601, "grad_norm": 1.0811692476272583, "learning_rate": 8.34640861283098e-08, "loss": 0.5122, "num_input_tokens_seen": 8508128, "step": 25275 }, { "epoch": 19.536321483771253, "grad_norm": 0.9225589632987976, "learning_rate": 8.209322405363929e-08, "loss": 0.4853, "num_input_tokens_seen": 8509920, "step": 25280 }, { "epoch": 19.54018547140649, "grad_norm": 0.7889278531074524, "learning_rate": 8.073369479219551e-08, "loss": 0.5609, "num_input_tokens_seen": 8511392, "step": 25285 }, { "epoch": 19.54404945904173, "grad_norm": 1.0965806245803833, "learning_rate": 7.938549896230329e-08, "loss": 0.3647, "num_input_tokens_seen": 8512896, "step": 25290 }, { "epoch": 19.547913446676972, "grad_norm": 0.950609564781189, "learning_rate": 7.804863717712774e-08, "loss": 0.3461, "num_input_tokens_seen": 8514464, "step": 25295 }, { "epoch": 19.55177743431221, "grad_norm": 1.0795005559921265, "learning_rate": 7.672311004468802e-08, "loss": 0.5623, "num_input_tokens_seen": 8516192, "step": 25300 }, { "epoch": 19.55564142194745, "grad_norm": 0.97816002368927, "learning_rate": 7.540891816783246e-08, "loss": 0.4376, "num_input_tokens_seen": 8517952, "step": 25305 }, { "epoch": 19.55950540958269, "grad_norm": 1.0616703033447266, "learning_rate": 7.410606214427185e-08, "loss": 0.7105, "num_input_tokens_seen": 8519648, "step": 25310 }, { "epoch": 19.56336939721793, "grad_norm": 1.1184676885604858, "learning_rate": 7.281454256654885e-08, "loss": 0.3486, "num_input_tokens_seen": 8521568, "step": 25315 }, { "epoch": 19.567233384853168, "grad_norm": 1.158949613571167, "learning_rate": 7.153436002205472e-08, "loss": 0.3862, "num_input_tokens_seen": 8523232, "step": 25320 }, { "epoch": 19.57109737248841, "grad_norm": 1.252232551574707, "learning_rate": 7.02655150930237e-08, "loss": 0.3969, "num_input_tokens_seen": 8524768, "step": 25325 }, { "epoch": 19.57496136012365, "grad_norm": 0.9118820428848267, "learning_rate": 6.900800835653587e-08, "loss": 0.5206, "num_input_tokens_seen": 8526784, "step": 25330 }, { "epoch": 19.578825347758887, "grad_norm": 1.9001473188400269, "learning_rate": 6.77618403845115e-08, "loss": 0.6394, "num_input_tokens_seen": 8528544, "step": 25335 }, { "epoch": 19.582689335394125, "grad_norm": 1.1134626865386963, "learning_rate": 6.652701174371389e-08, "loss": 0.3693, "num_input_tokens_seen": 8530208, "step": 25340 }, { "epoch": 19.586553323029367, "grad_norm": 0.8742551803588867, "learning_rate": 6.530352299575215e-08, "loss": 0.3864, "num_input_tokens_seen": 8531968, "step": 25345 }, { "epoch": 19.590417310664606, "grad_norm": 1.0719109773635864, "learning_rate": 6.409137469707837e-08, "loss": 0.4581, "num_input_tokens_seen": 8533664, "step": 25350 }, { "epoch": 19.594281298299844, "grad_norm": 1.0305348634719849, "learning_rate": 6.289056739898213e-08, "loss": 0.4, "num_input_tokens_seen": 8535168, "step": 25355 }, { "epoch": 19.598145285935086, "grad_norm": 1.3565808534622192, "learning_rate": 6.170110164759879e-08, "loss": 0.4574, "num_input_tokens_seen": 8536928, "step": 25360 }, { "epoch": 19.602009273570324, "grad_norm": 1.370505690574646, "learning_rate": 6.052297798390116e-08, "loss": 0.512, "num_input_tokens_seen": 8538400, "step": 25365 }, { "epoch": 19.605873261205563, "grad_norm": 0.9713515639305115, "learning_rate": 5.9356196943713415e-08, "loss": 0.3664, "num_input_tokens_seen": 8540256, "step": 25370 }, { "epoch": 19.609737248840805, "grad_norm": 0.8671584129333496, "learning_rate": 5.8200759057688845e-08, "loss": 0.3539, "num_input_tokens_seen": 8541952, "step": 25375 }, { "epoch": 19.613601236476043, "grad_norm": 0.8344293236732483, "learning_rate": 5.705666485132932e-08, "loss": 0.4963, "num_input_tokens_seen": 8543584, "step": 25380 }, { "epoch": 19.61746522411128, "grad_norm": 1.1820238828659058, "learning_rate": 5.5923914844976944e-08, "loss": 0.3893, "num_input_tokens_seen": 8545088, "step": 25385 }, { "epoch": 19.621329211746524, "grad_norm": 0.8930277824401855, "learning_rate": 5.4802509553811274e-08, "loss": 0.4511, "num_input_tokens_seen": 8546816, "step": 25390 }, { "epoch": 19.625193199381762, "grad_norm": 0.8626125454902649, "learning_rate": 5.3692449487857675e-08, "loss": 0.4384, "num_input_tokens_seen": 8548544, "step": 25395 }, { "epoch": 19.629057187017, "grad_norm": 1.1450963020324707, "learning_rate": 5.259373515197341e-08, "loss": 0.7468, "num_input_tokens_seen": 8550400, "step": 25400 }, { "epoch": 19.632921174652243, "grad_norm": 0.6238531470298767, "learning_rate": 5.150636704586431e-08, "loss": 0.372, "num_input_tokens_seen": 8552064, "step": 25405 }, { "epoch": 19.63678516228748, "grad_norm": 0.8532556891441345, "learning_rate": 5.043034566406812e-08, "loss": 0.4002, "num_input_tokens_seen": 8553728, "step": 25410 }, { "epoch": 19.64064914992272, "grad_norm": 0.9796019792556763, "learning_rate": 4.936567149596838e-08, "loss": 0.3647, "num_input_tokens_seen": 8555392, "step": 25415 }, { "epoch": 19.64451313755796, "grad_norm": 1.104549765586853, "learning_rate": 4.8312345025786075e-08, "loss": 0.4833, "num_input_tokens_seen": 8556992, "step": 25420 }, { "epoch": 19.6483771251932, "grad_norm": 1.1190173625946045, "learning_rate": 4.7270366732576896e-08, "loss": 0.4176, "num_input_tokens_seen": 8558432, "step": 25425 }, { "epoch": 19.652241112828438, "grad_norm": 1.0578471422195435, "learning_rate": 4.6239737090242316e-08, "loss": 0.548, "num_input_tokens_seen": 8560160, "step": 25430 }, { "epoch": 19.65610510046368, "grad_norm": 1.8035998344421387, "learning_rate": 4.5220456567515725e-08, "loss": 0.3664, "num_input_tokens_seen": 8562016, "step": 25435 }, { "epoch": 19.65996908809892, "grad_norm": 1.3326096534729004, "learning_rate": 4.421252562797629e-08, "loss": 0.5335, "num_input_tokens_seen": 8563808, "step": 25440 }, { "epoch": 19.663833075734157, "grad_norm": 0.8830553293228149, "learning_rate": 4.321594473003232e-08, "loss": 0.4226, "num_input_tokens_seen": 8565536, "step": 25445 }, { "epoch": 19.667697063369395, "grad_norm": 0.732339084148407, "learning_rate": 4.22307143269407e-08, "loss": 0.4821, "num_input_tokens_seen": 8567328, "step": 25450 }, { "epoch": 19.671561051004637, "grad_norm": 0.7291693687438965, "learning_rate": 4.125683486678189e-08, "loss": 0.6661, "num_input_tokens_seen": 8568864, "step": 25455 }, { "epoch": 19.675425038639876, "grad_norm": 1.1785387992858887, "learning_rate": 4.0294306792490466e-08, "loss": 0.3933, "num_input_tokens_seen": 8570560, "step": 25460 }, { "epoch": 19.679289026275114, "grad_norm": 0.9592893123626709, "learning_rate": 3.934313054182459e-08, "loss": 0.4386, "num_input_tokens_seen": 8572288, "step": 25465 }, { "epoch": 19.683153013910356, "grad_norm": 1.65675950050354, "learning_rate": 3.840330654738544e-08, "loss": 0.5573, "num_input_tokens_seen": 8574112, "step": 25470 }, { "epoch": 19.687017001545595, "grad_norm": 0.8163240551948547, "learning_rate": 3.747483523661166e-08, "loss": 0.3564, "num_input_tokens_seen": 8575680, "step": 25475 }, { "epoch": 19.690880989180833, "grad_norm": 0.7542116641998291, "learning_rate": 3.655771703177935e-08, "loss": 0.3815, "num_input_tokens_seen": 8577440, "step": 25480 }, { "epoch": 19.694744976816075, "grad_norm": 0.9276842474937439, "learning_rate": 3.565195234999652e-08, "loss": 0.4857, "num_input_tokens_seen": 8579072, "step": 25485 }, { "epoch": 19.698608964451314, "grad_norm": 0.7320724129676819, "learning_rate": 3.475754160321143e-08, "loss": 0.4403, "num_input_tokens_seen": 8581024, "step": 25490 }, { "epoch": 19.702472952086552, "grad_norm": 1.1358994245529175, "learning_rate": 3.3874485198207015e-08, "loss": 0.5091, "num_input_tokens_seen": 8582784, "step": 25495 }, { "epoch": 19.706336939721794, "grad_norm": 1.0343358516693115, "learning_rate": 3.3002783536603685e-08, "loss": 0.398, "num_input_tokens_seen": 8584736, "step": 25500 }, { "epoch": 19.710200927357032, "grad_norm": 1.5543203353881836, "learning_rate": 3.214243701485653e-08, "loss": 0.3727, "num_input_tokens_seen": 8586304, "step": 25505 }, { "epoch": 19.71406491499227, "grad_norm": 1.1272510290145874, "learning_rate": 3.129344602425255e-08, "loss": 0.4718, "num_input_tokens_seen": 8588192, "step": 25510 }, { "epoch": 19.717928902627513, "grad_norm": 0.7140488624572754, "learning_rate": 3.045581095092453e-08, "loss": 0.7165, "num_input_tokens_seen": 8589856, "step": 25515 }, { "epoch": 19.72179289026275, "grad_norm": 1.4915485382080078, "learning_rate": 2.9629532175828867e-08, "loss": 0.4209, "num_input_tokens_seen": 8591584, "step": 25520 }, { "epoch": 19.72565687789799, "grad_norm": 1.3859955072402954, "learning_rate": 2.881461007476216e-08, "loss": 0.6293, "num_input_tokens_seen": 8593376, "step": 25525 }, { "epoch": 19.72952086553323, "grad_norm": 0.99944007396698, "learning_rate": 2.8011045018361272e-08, "loss": 0.5584, "num_input_tokens_seen": 8595072, "step": 25530 }, { "epoch": 19.73338485316847, "grad_norm": 0.7900106310844421, "learning_rate": 2.7218837372086636e-08, "loss": 0.3532, "num_input_tokens_seen": 8596832, "step": 25535 }, { "epoch": 19.73724884080371, "grad_norm": 0.745265543460846, "learning_rate": 2.6437987496238935e-08, "loss": 0.4546, "num_input_tokens_seen": 8598528, "step": 25540 }, { "epoch": 19.74111282843895, "grad_norm": 0.9105750322341919, "learning_rate": 2.566849574595631e-08, "loss": 0.4291, "num_input_tokens_seen": 8599936, "step": 25545 }, { "epoch": 19.74497681607419, "grad_norm": 0.7152537107467651, "learning_rate": 2.4910362471208815e-08, "loss": 0.4655, "num_input_tokens_seen": 8601632, "step": 25550 }, { "epoch": 19.748840803709427, "grad_norm": 1.0750882625579834, "learning_rate": 2.4163588016795636e-08, "loss": 0.5188, "num_input_tokens_seen": 8603232, "step": 25555 }, { "epoch": 19.75270479134467, "grad_norm": 0.9158886075019836, "learning_rate": 2.3428172722358977e-08, "loss": 0.366, "num_input_tokens_seen": 8604864, "step": 25560 }, { "epoch": 19.756568778979908, "grad_norm": 0.912071704864502, "learning_rate": 2.270411692237018e-08, "loss": 0.3802, "num_input_tokens_seen": 8606432, "step": 25565 }, { "epoch": 19.760432766615146, "grad_norm": 1.0991923809051514, "learning_rate": 2.1991420946129714e-08, "loss": 0.4289, "num_input_tokens_seen": 8608256, "step": 25570 }, { "epoch": 19.764296754250385, "grad_norm": 0.9003450870513916, "learning_rate": 2.12900851177783e-08, "loss": 0.4844, "num_input_tokens_seen": 8609696, "step": 25575 }, { "epoch": 19.768160741885627, "grad_norm": 1.3833537101745605, "learning_rate": 2.0600109756288565e-08, "loss": 0.5142, "num_input_tokens_seen": 8611232, "step": 25580 }, { "epoch": 19.772024729520865, "grad_norm": 0.6644879579544067, "learning_rate": 1.992149517546227e-08, "loss": 0.4049, "num_input_tokens_seen": 8612896, "step": 25585 }, { "epoch": 19.775888717156104, "grad_norm": 2.7628941535949707, "learning_rate": 1.925424168394141e-08, "loss": 0.5319, "num_input_tokens_seen": 8614656, "step": 25590 }, { "epoch": 19.779752704791346, "grad_norm": 1.0318448543548584, "learning_rate": 1.8598349585197128e-08, "loss": 0.3682, "num_input_tokens_seen": 8616416, "step": 25595 }, { "epoch": 19.783616692426584, "grad_norm": 1.0150880813598633, "learning_rate": 1.7953819177529697e-08, "loss": 0.4046, "num_input_tokens_seen": 8618080, "step": 25600 }, { "epoch": 19.787480680061822, "grad_norm": 1.2418102025985718, "learning_rate": 1.732065075407685e-08, "loss": 0.4255, "num_input_tokens_seen": 8619808, "step": 25605 }, { "epoch": 19.791344667697064, "grad_norm": 1.3765960931777954, "learning_rate": 1.6698844602808238e-08, "loss": 0.4847, "num_input_tokens_seen": 8621344, "step": 25610 }, { "epoch": 19.795208655332303, "grad_norm": 1.467345952987671, "learning_rate": 1.6088401006522647e-08, "loss": 0.439, "num_input_tokens_seen": 8622880, "step": 25615 }, { "epoch": 19.79907264296754, "grad_norm": 1.2693270444869995, "learning_rate": 1.548932024285632e-08, "loss": 0.6122, "num_input_tokens_seen": 8624512, "step": 25620 }, { "epoch": 19.802936630602783, "grad_norm": 0.9323533773422241, "learning_rate": 1.4901602584271868e-08, "loss": 0.4523, "num_input_tokens_seen": 8626432, "step": 25625 }, { "epoch": 19.80680061823802, "grad_norm": 0.7547949552536011, "learning_rate": 1.4325248298069361e-08, "loss": 0.4126, "num_input_tokens_seen": 8627968, "step": 25630 }, { "epoch": 19.81066460587326, "grad_norm": 0.7284120917320251, "learning_rate": 1.3760257646378005e-08, "loss": 0.388, "num_input_tokens_seen": 8629504, "step": 25635 }, { "epoch": 19.814528593508502, "grad_norm": 0.7038025259971619, "learning_rate": 1.3206630886158921e-08, "loss": 0.4411, "num_input_tokens_seen": 8631104, "step": 25640 }, { "epoch": 19.81839258114374, "grad_norm": 0.9515994191169739, "learning_rate": 1.2664368269202365e-08, "loss": 0.389, "num_input_tokens_seen": 8632896, "step": 25645 }, { "epoch": 19.82225656877898, "grad_norm": 0.9772794246673584, "learning_rate": 1.2133470042136052e-08, "loss": 0.3571, "num_input_tokens_seen": 8634528, "step": 25650 }, { "epoch": 19.82612055641422, "grad_norm": 1.3391315937042236, "learning_rate": 1.161393644641129e-08, "loss": 0.5687, "num_input_tokens_seen": 8636096, "step": 25655 }, { "epoch": 19.82998454404946, "grad_norm": 0.6516287922859192, "learning_rate": 1.1105767718319614e-08, "loss": 0.4203, "num_input_tokens_seen": 8637728, "step": 25660 }, { "epoch": 19.833848531684698, "grad_norm": 0.6667765974998474, "learning_rate": 1.0608964088978934e-08, "loss": 0.3984, "num_input_tokens_seen": 8639392, "step": 25665 }, { "epoch": 19.83771251931994, "grad_norm": 1.2333307266235352, "learning_rate": 1.012352578433351e-08, "loss": 0.4175, "num_input_tokens_seen": 8640928, "step": 25670 }, { "epoch": 19.841576506955178, "grad_norm": 1.3471275568008423, "learning_rate": 9.649453025170618e-09, "loss": 0.3835, "num_input_tokens_seen": 8642304, "step": 25675 }, { "epoch": 19.845440494590417, "grad_norm": 0.91947340965271, "learning_rate": 9.186746027095571e-09, "loss": 0.3692, "num_input_tokens_seen": 8644096, "step": 25680 }, { "epoch": 19.84930448222566, "grad_norm": 0.8805829286575317, "learning_rate": 8.73540500055392e-09, "loss": 0.4093, "num_input_tokens_seen": 8646016, "step": 25685 }, { "epoch": 19.853168469860897, "grad_norm": 0.7861018180847168, "learning_rate": 8.295430150814798e-09, "loss": 0.4154, "num_input_tokens_seen": 8647808, "step": 25690 }, { "epoch": 19.857032457496135, "grad_norm": 0.6069735884666443, "learning_rate": 7.866821677984804e-09, "loss": 0.3544, "num_input_tokens_seen": 8649408, "step": 25695 }, { "epoch": 19.860896445131374, "grad_norm": 1.0005277395248413, "learning_rate": 7.449579776996895e-09, "loss": 0.6812, "num_input_tokens_seen": 8651072, "step": 25700 }, { "epoch": 19.864760432766616, "grad_norm": 1.4041367769241333, "learning_rate": 7.043704637613169e-09, "loss": 0.4973, "num_input_tokens_seen": 8652576, "step": 25705 }, { "epoch": 19.868624420401854, "grad_norm": 0.9335018992424011, "learning_rate": 6.6491964444304054e-09, "loss": 0.4299, "num_input_tokens_seen": 8654368, "step": 25710 }, { "epoch": 19.872488408037093, "grad_norm": 0.6767784953117371, "learning_rate": 6.266055376871749e-09, "loss": 0.4449, "num_input_tokens_seen": 8656032, "step": 25715 }, { "epoch": 19.876352395672335, "grad_norm": 1.288535714149475, "learning_rate": 5.894281609195029e-09, "loss": 0.4467, "num_input_tokens_seen": 8657440, "step": 25720 }, { "epoch": 19.880216383307573, "grad_norm": 1.6840230226516724, "learning_rate": 5.533875310478886e-09, "loss": 0.538, "num_input_tokens_seen": 8659488, "step": 25725 }, { "epoch": 19.88408037094281, "grad_norm": 2.091585397720337, "learning_rate": 5.184836644644975e-09, "loss": 0.5946, "num_input_tokens_seen": 8660992, "step": 25730 }, { "epoch": 19.887944358578054, "grad_norm": 1.142185091972351, "learning_rate": 4.847165770435758e-09, "loss": 0.3782, "num_input_tokens_seen": 8662560, "step": 25735 }, { "epoch": 19.891808346213292, "grad_norm": 0.8309038281440735, "learning_rate": 4.52086284142561e-09, "loss": 0.3508, "num_input_tokens_seen": 8664416, "step": 25740 }, { "epoch": 19.89567233384853, "grad_norm": 1.3602311611175537, "learning_rate": 4.205928006018045e-09, "loss": 0.582, "num_input_tokens_seen": 8666304, "step": 25745 }, { "epoch": 19.899536321483772, "grad_norm": 1.3994548320770264, "learning_rate": 3.9023614074484845e-09, "loss": 0.4249, "num_input_tokens_seen": 8667808, "step": 25750 }, { "epoch": 19.90340030911901, "grad_norm": 0.9902821183204651, "learning_rate": 3.6101631837814896e-09, "loss": 0.4608, "num_input_tokens_seen": 8669440, "step": 25755 }, { "epoch": 19.90726429675425, "grad_norm": 1.109973669052124, "learning_rate": 3.32933346790798e-09, "loss": 0.423, "num_input_tokens_seen": 8671232, "step": 25760 }, { "epoch": 19.91112828438949, "grad_norm": 1.0520908832550049, "learning_rate": 3.059872387553564e-09, "loss": 0.6547, "num_input_tokens_seen": 8673440, "step": 25765 }, { "epoch": 19.91499227202473, "grad_norm": 0.8043015003204346, "learning_rate": 2.8017800652702097e-09, "loss": 0.3459, "num_input_tokens_seen": 8675072, "step": 25770 }, { "epoch": 19.918856259659968, "grad_norm": 0.9794676303863525, "learning_rate": 2.5550566184390224e-09, "loss": 0.4803, "num_input_tokens_seen": 8676896, "step": 25775 }, { "epoch": 19.92272024729521, "grad_norm": 1.0088192224502563, "learning_rate": 2.3197021592730185e-09, "loss": 0.4633, "num_input_tokens_seen": 8678464, "step": 25780 }, { "epoch": 19.92658423493045, "grad_norm": 1.5148600339889526, "learning_rate": 2.095716794811575e-09, "loss": 0.6559, "num_input_tokens_seen": 8680064, "step": 25785 }, { "epoch": 19.930448222565687, "grad_norm": 0.9922068119049072, "learning_rate": 1.883100626925982e-09, "loss": 0.3856, "num_input_tokens_seen": 8681760, "step": 25790 }, { "epoch": 19.93431221020093, "grad_norm": 1.237323522567749, "learning_rate": 1.6818537523111134e-09, "loss": 0.4179, "num_input_tokens_seen": 8683552, "step": 25795 }, { "epoch": 19.938176197836167, "grad_norm": 1.2627619504928589, "learning_rate": 1.491976262499306e-09, "loss": 0.4656, "num_input_tokens_seen": 8685280, "step": 25800 }, { "epoch": 19.942040185471406, "grad_norm": 0.7940593957901001, "learning_rate": 1.3134682438492585e-09, "loss": 0.3768, "num_input_tokens_seen": 8687072, "step": 25805 }, { "epoch": 19.945904173106648, "grad_norm": 1.1620961427688599, "learning_rate": 1.1463297775432535e-09, "loss": 0.5661, "num_input_tokens_seen": 8689088, "step": 25810 }, { "epoch": 19.949768160741886, "grad_norm": 0.8556297421455383, "learning_rate": 9.905609395982617e-10, "loss": 0.502, "num_input_tokens_seen": 8690624, "step": 25815 }, { "epoch": 19.953632148377125, "grad_norm": 0.5333364009857178, "learning_rate": 8.4616180086039e-10, "loss": 0.4084, "num_input_tokens_seen": 8692192, "step": 25820 }, { "epoch": 19.957496136012363, "grad_norm": 1.0248669385910034, "learning_rate": 7.1313242699933e-10, "loss": 0.4224, "num_input_tokens_seen": 8693952, "step": 25825 }, { "epoch": 19.961360123647605, "grad_norm": 1.1239548921585083, "learning_rate": 5.914728785250123e-10, "loss": 0.4283, "num_input_tokens_seen": 8695648, "step": 25830 }, { "epoch": 19.965224111282843, "grad_norm": 0.8463932275772095, "learning_rate": 4.811832107598502e-10, "loss": 0.385, "num_input_tokens_seen": 8697440, "step": 25835 }, { "epoch": 19.969088098918082, "grad_norm": 0.6409337520599365, "learning_rate": 3.8226347387204654e-10, "loss": 0.4856, "num_input_tokens_seen": 8698912, "step": 25840 }, { "epoch": 19.972952086553324, "grad_norm": 1.4500172138214111, "learning_rate": 2.9471371284783834e-10, "loss": 0.453, "num_input_tokens_seen": 8700640, "step": 25845 }, { "epoch": 19.976816074188562, "grad_norm": 0.8400689959526062, "learning_rate": 2.185339675025988e-10, "loss": 0.3828, "num_input_tokens_seen": 8702560, "step": 25850 }, { "epoch": 19.9806800618238, "grad_norm": 0.6658493280410767, "learning_rate": 1.5372427248638853e-10, "loss": 0.329, "num_input_tokens_seen": 8704352, "step": 25855 }, { "epoch": 19.984544049459043, "grad_norm": 0.9370764493942261, "learning_rate": 1.0028465727562885e-10, "loss": 0.3818, "num_input_tokens_seen": 8706176, "step": 25860 }, { "epoch": 19.98840803709428, "grad_norm": 0.9386910796165466, "learning_rate": 5.821514617587731e-11, "loss": 0.4014, "num_input_tokens_seen": 8707936, "step": 25865 }, { "epoch": 19.99227202472952, "grad_norm": 1.1661734580993652, "learning_rate": 2.751575831627662e-11, "loss": 0.4466, "num_input_tokens_seen": 8709504, "step": 25870 }, { "epoch": 19.99613601236476, "grad_norm": 0.8374985456466675, "learning_rate": 8.186507660656873e-12, "loss": 0.4297, "num_input_tokens_seen": 8710912, "step": 25875 }, { "epoch": 20.0, "grad_norm": 3.0246078968048096, "learning_rate": 2.2740300198442753e-13, "loss": 0.5761, "num_input_tokens_seen": 8712528, "step": 25880 }, { "epoch": 20.0, "eval_loss": 0.46312153339385986, "eval_runtime": 6.3718, "eval_samples_per_second": 90.241, "eval_steps_per_second": 22.599, "num_input_tokens_seen": 8712528, "step": 25880 }, { "epoch": 20.0, "num_input_tokens_seen": 8712528, "step": 25880, "total_flos": 3.9233147577237504e+17, "train_loss": 0.6318969237610951, "train_runtime": 3960.2188, "train_samples_per_second": 26.13, "train_steps_per_second": 6.535 } ], "logging_steps": 5, "max_steps": 25880, "num_input_tokens_seen": 8712528, "num_train_epochs": 20, "save_steps": 1294, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.9233147577237504e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }